|
|
|
This module will download the dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse the training set and
test set into paddle reader creators.
|
|
|
|
|
"""
|
|
|
|
|
import paddle.v2.dataset.common as common
|
|
|
|
|
import paddle.v2.dataset.common
|
|
|
|
|
import collections
|
|
|
|
|
import tarfile
|
|
|
|
|
|
|
|
|
|
# Names exported by ``from <module> import *``. A single assignment: the
# earlier duplicate that omitted 'convert' was dead code, overwritten on the
# very next statement.
__all__ = ['train', 'test', 'build_dict', 'convert']

# Source archive and the MD5 digest handed to the download helper in fetch().
URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
MD5 = '30177ea32e27c525793142b6bf2c8e2d'
|
|
|
|
@ -145,7 +145,7 @@ def test(word_idx, n, data_type=DataType.NGRAM):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch():
    """
    Fetch the imikolov archive through the shared dataset download helper.

    Forwards the module-level ``URL`` and ``MD5`` together with the dataset
    name ``"imikolov"`` to ``paddle.v2.dataset.common.download``. The helper's
    return value is discarded.
    """
    # NOTE(review): presumably the helper caches the archive and checks it
    # against MD5 -- confirm in paddle.v2.dataset.common.
    # One call only: the pre-change `common.download(...)` spelling was the
    # same request issued a second time.
    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert(path):
    """
    Convert the imikolov train and test readers with the shared dataset
    conversion helper.

    :param path: forwarded unchanged as the first argument of
        ``paddle.v2.dataset.common.convert``
        (presumably the output location -- TODO confirm against the helper).
    """
    # Passed as `n` to train()/test(); with the default DataType.NGRAM of
    # test() this looks like the n-gram length -- confirm for train().
    N = 5
    word_dict = build_dict()
    # Each split is converted exactly once. The third positional argument
    # (10) and the name string are interpreted by common.convert.
    paddle.v2.dataset.common.convert(path,
                                     train(word_dict, N), 10, "imikolov_train")
    paddle.v2.dataset.common.convert(path,
                                     test(word_dict, N), 10, "imikolov_test")
|
|
|
|
|