|
|
|
@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
|
|
|
|
|
word_dict = defaultdict(int)
|
|
|
|
|
with tarfile.open(tar_file, mode="r") as f:
|
|
|
|
|
for line in f.extractfile("wmt16/train"):
|
|
|
|
|
line_split = line.strip().split(six.b("\t"))
|
|
|
|
|
line = cpt.to_text(line)
|
|
|
|
|
line_split = line.strip().split("\t")
|
|
|
|
|
if len(line_split) != 2: continue
|
|
|
|
|
sen = line_split[0] if lang == "en" else line_split[1]
|
|
|
|
|
for w in sen.split():
|
|
|
|
@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
|
|
|
|
|
|
|
|
|
|
with tarfile.open(tar_file, mode="r") as f:
|
|
|
|
|
for line in f.extractfile(file_name):
|
|
|
|
|
line_split = line.strip().split(six.b("\t"))
|
|
|
|
|
line = cpt.to_text(line)
|
|
|
|
|
line_split = line.strip().split("\t")
|
|
|
|
|
if len(line_split) != 2:
|
|
|
|
|
continue
|
|
|
|
|
src_words = line_split[src_col].split()
|
|
|
|
|