@@ -26,7 +26,7 @@ URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/de
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be add later.
 URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
-MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
+MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
 # this is the pretrained model, whose bleu = 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
 MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
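For reference, a locally cached copy of wmt14.tgz can be checked against the updated checksum with a small standalone sketch using only the standard library; the cache path below is hypothetical and should be adjusted to wherever the archive actually lives.

import hashlib

# Hypothetical path to the cached training archive.
archive_path = '/tmp/paddle_cache/wmt14/wmt14.tgz'

md5 = hashlib.md5()
with open(archive_path, 'rb') as f:
    # Read in 1 MiB chunks so large archives do not need to fit in memory.
    for chunk in iter(lambda: f.read(1 << 20), b''):
        md5.update(chunk)

# Expected to print the new MD5_TRAIN value, 0791583d57d5beb693b9414c5b36798c,
# once the re-packaged training archive has been downloaded.
print(md5.hexdigest())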
@@ -108,6 +108,11 @@ def test(dict_size):
         download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
 
 
+def gen(dict_size):
+    return reader_creator(
+        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size)
+
+
 def model():
     tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL)
     with gzip.open(tar_file, 'r') as f:
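A rough sketch of how the new gen() reader might be consumed. It assumes the module is used as paddle.v2.dataset.wmt14 and that, like train() and test(), each sample is a tuple of word-id sequences; the exact sample layout is an assumption, not confirmed by this diff.

import paddle.v2.dataset.wmt14 as wmt14

dict_size = 30000
# gen(dict_size) builds a reader over the gen/gen split of the archive;
# calling the reader yields one sample at a time.
reader = wmt14.gen(dict_size)
for sample in reader():
    # Assumed layout, mirroring train()/test(): (src_ids, trg_ids, trg_ids_next).
    src_ids = sample[0]
    print(len(src_ids))
    break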
@@ -115,10 +120,15 @@ def model():
     return parameters
 
 
-def trg_dict(dict_size):
+def get_dict(dict_size, reverse=True):
+    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
+    # else reverse = true, return dict = {'001':'a', '002':'b', ...}
     tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN)
     src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
-    return trg_dict
+    if reverse:
+        src_dict = {v: k for k, v in src_dict.items()}
+        trg_dict = {v: k for k, v in trg_dict.items()}
+    return src_dict, trg_dict
 
 
 def fetch():
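A small sketch of how the reworked get_dict() could be used to map generated word ids back to words, following the behavior added in this hunk. The module path and the sample ids are assumptions for illustration only.

import paddle.v2.dataset.wmt14 as wmt14

dict_size = 30000
# With reverse=True (the default), both dictionaries map id -> word,
# which is the direction needed when decoding generated id sequences.
src_dict, trg_dict = wmt14.get_dict(dict_size, reverse=True)

generated_ids = [2, 57, 193, 8, 1]  # hypothetical ids produced by the model
words = [trg_dict.get(i, '<unk>') for i in generated_ids]
print(' '.join(words))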