|
|
|
|
@ -23,7 +23,7 @@ to initialize SRL model.
|
|
|
|
|
import tarfile
|
|
|
|
|
import gzip
|
|
|
|
|
import itertools
|
|
|
|
|
import paddle.v2.dataset.common
|
|
|
|
|
import paddle.dataset.common
|
|
|
|
|
|
|
|
|
|
# Public API of this module. Each exported name must be its own string:
# a single 'test, get_dict' entry names no real attribute and makes
# `from <module> import *` fail.
__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
|
|
|
|
|
|
|
|
|
|
@ -203,14 +203,11 @@ def get_dict():
|
|
|
|
|
Get the word, verb and label dictionary of Wikipedia corpus.
|
|
|
|
|
"""
|
|
|
|
|
word_dict = load_dict(
|
|
|
|
|
paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
|
|
|
|
|
WORDDICT_MD5))
|
|
|
|
|
paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
|
|
|
|
|
verb_dict = load_dict(
|
|
|
|
|
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
|
|
|
|
|
VERBDICT_MD5))
|
|
|
|
|
paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
|
|
|
|
|
label_dict = load_label_dict(
|
|
|
|
|
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
|
|
|
|
|
TRGDICT_MD5))
|
|
|
|
|
paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
|
|
|
|
|
return word_dict, verb_dict, label_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -218,7 +215,7 @@ def get_embedding():
|
|
|
|
|
"""
|
|
|
|
|
Get the trained word vector based on Wikipedia corpus.
|
|
|
|
|
"""
|
|
|
|
|
return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
|
|
|
|
|
return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test():
|
|
|
|
|
@ -235,23 +232,23 @@ def test():
|
|
|
|
|
"""
|
|
|
|
|
word_dict, verb_dict, label_dict = get_dict()
|
|
|
|
|
reader = corpus_reader(
|
|
|
|
|
paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
|
|
|
|
|
paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
|
|
|
|
|
words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
|
|
|
|
|
props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
|
|
|
|
|
return reader_creator(reader, word_dict, verb_dict, label_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch():
    """
    Download every resource this module reads: the word, verb and label
    dictionaries, the embedding file and the data archive.

    Each resource is requested once through paddle.dataset.common.download
    with its URL, the 'conll05st' dataset name and its expected MD5. The
    values returned by download() are discarded.
    """
    # Only the paddle.dataset.common API is used here; issuing the same
    # requests through the legacy paddle.v2.dataset.common namespace as well
    # would just repeat every download.
    resources = (
        (WORDDICT_URL, WORDDICT_MD5),
        (VERBDICT_URL, VERBDICT_MD5),
        (TRGDICT_URL, TRGDICT_MD5),
        (EMB_URL, EMB_MD5),
        (DATA_URL, DATA_MD5),
    )
    for url, md5 in resources:
        paddle.dataset.common.download(url, 'conll05st', md5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert(path):
    """
    Converts dataset to recordio format

    :param path: forwarded unchanged as the first argument of
        paddle.dataset.common.convert (presumably the output location --
        confirm against that function's documentation).
    """
    # Both outputs are built from the test() reader; this module exposes no
    # separate training reader. Each conversion is issued once, through the
    # paddle.dataset.common API only.
    # NOTE(review): the "conl105" prefix looks like a typo for "conll05", but
    # it is a runtime name consumers may depend on, so it is kept as-is.
    paddle.dataset.common.convert(path, test(), 1000, "conl105_train")
    paddle.dataset.common.convert(path, test(), 1000, "conl105_test")
|