commit 62ff19e388
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fit_a_line
@@ -0,0 +1,205 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Conll 2005 dataset. The Paddle semantic role labeling Book and demo use
this dataset as an example. Because the full Conll 2005 corpus is not
freely available, the default download URL points to its public test set.
Users can change the URL and MD5 below to use their own copy of the corpus.
"""
import gzip
import itertools
import tarfile

import paddle.v2.dataset.common as common

__all__ = ['test', 'get_dict', 'get_embedding']

DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'

UNK_IDX = 0


def load_dict(filename):
    """Load a vocabulary file, mapping each line's token to its line number."""
    d = dict()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            d[line.strip()] = i
    return d


def corpus_reader(data_path, words_name, props_name):
    """
    Read one corpus and return a reader function. The reader yields one
    tuple per predicate: the sentence (a list of words), the predicate
    word, and the label sequence (a list of BIO-style tag strings).

    :return: a reader of (sentence, predicate, labels) tuples.
    :rtype: callable
    """

    def reader():
        tf = tarfile.open(data_path)
        wf = tf.extractfile(words_name)
        pf = tf.extractfile(props_name)
        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
                fileobj=pf) as props_file:
            sentences = []
            labels = []
            one_seg = []
            for word, label in itertools.izip(words_file, props_file):
                word = word.strip()
                label = label.strip().split()

                if len(label) == 0:  # end of sentence
                    # Transpose the per-word annotation rows into one label
                    # column per predicate.
                    for i in xrange(len(one_seg[0])):
                        a_kind_label = [x[i] for x in one_seg]
                        labels.append(a_kind_label)

                    if len(labels) >= 1:
                        # Column 0 lists the predicate words of the sentence.
                        verb_list = []
                        for x in labels[0]:
                            if x != '-':
                                verb_list.append(x)

                        # Convert each bracketed column into a BIO tag sequence.
                        for i, lbl in enumerate(labels[1:]):
                            cur_tag = 'O'
                            is_in_bracket = False
                            lbl_seq = []
                            for l in lbl:
                                if l == '*' and not is_in_bracket:
                                    lbl_seq.append('O')
                                elif l == '*' and is_in_bracket:
                                    lbl_seq.append('I-' + cur_tag)
                                elif l == '*)':
                                    lbl_seq.append('I-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') != -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') == -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = True
                                else:
                                    raise RuntimeError('Unexpected label: %s' %
                                                       l)

                            yield sentences, verb_list[i], lbl_seq

                    sentences = []
                    labels = []
                    one_seg = []
                else:
                    sentences.append(word)
                    one_seg.append(label)

        pf.close()
        wf.close()
        tf.close()

    return reader
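# Usage sketch for corpus_reader (the member names match the archive layout
# used by test() below):
#
#   reader = corpus_reader(
#       common.download(DATA_URL, 'conll05st', DATA_MD5),
#       words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
#       props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
#   for words, predicate, tags in reader():
#       pass  # words: token list; predicate: verb word; tags: B-/I-/O strings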


def reader_creator(corpus_reader,
                   word_dict=None,
                   predicate_dict=None,
                   label_dict=None):
    def reader():
        for sentence, predicate, labels in corpus_reader():
            sen_len = len(sentence)

            # Mark a window of up to five words centered on the predicate
            # and collect the context words, padding with 'bos'/'eos' at
            # the sentence boundaries.
            verb_index = labels.index('B-V')
            mark = [0] * len(labels)
            if verb_index > 0:
                mark[verb_index - 1] = 1
                ctx_n1 = sentence[verb_index - 1]
            else:
                ctx_n1 = 'bos'

            if verb_index > 1:
                mark[verb_index - 2] = 1
                ctx_n2 = sentence[verb_index - 2]
            else:
                ctx_n2 = 'bos'

            mark[verb_index] = 1
            ctx_0 = sentence[verb_index]

            if verb_index < len(labels) - 1:
                mark[verb_index + 1] = 1
                ctx_p1 = sentence[verb_index + 1]
            else:
                ctx_p1 = 'eos'

            if verb_index < len(labels) - 2:
                mark[verb_index + 2] = 1
                ctx_p2 = sentence[verb_index + 2]
            else:
                ctx_p2 = 'eos'

            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
            pred_idx = [predicate_dict.get(predicate)] * sen_len

            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len

            label_idx = [label_dict.get(w) for w in labels]

            yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \
                ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx

    return reader()
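# Each sample yielded above is a 9-slot tuple:
#   word_idx   - word ids of the sentence
#   pred_idx   - predicate id, repeated once per word
#   ctx_n2_idx .. ctx_p2_idx - ids of the five context words around the
#                predicate, each repeated once per word
#   mark       - 0/1 flags marking the predicate window
#   label_idx  - label ids, one per word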


def get_dict():
    """Download and load the word, verb and label dictionaries."""
    word_dict = load_dict(
        common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(
        common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(
        common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict


def get_embedding():
    """Download the pre-trained word embeddings and return the local path."""
    return common.download(EMB_URL, 'conll05st', EMB_MD5)


def test():
    """Return an iterator over samples built from the public Conll05 test set."""
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        common.download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)


if __name__ == '__main__':
    print get_embedding()
    for f in test():
        print f
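For context, a consumption sketch; it assumes the v2 reader helpers paddle.batch and paddle.reader.shuffle from the same release. Note that test() here returns an iterator rather than a reader function, so it is wrapped in a lambda before being handed to the decorators, which call their argument:

    import paddle.v2 as paddle
    import paddle.v2.dataset.conll05 as conll05

    # Shuffle and batch the public test split; buf_size and batch_size are
    # arbitrary illustration values.
    batched = paddle.batch(
        paddle.reader.shuffle(lambda: conll05.test(), buf_size=8192),
        batch_size=10)
    for a_batch in batched():
        break  # each batch is a list of the 9-slot samples described above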
@@ -1,12 +1,16 @@
-add_test(NAME test_v2_api
-  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE})
-
 add_test(NAME test_v2_layer
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
     ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_layer.py
   WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
+add_test(NAME test_v2_api
+  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE})
+
+add_test(NAME test_v2_rnn_layer
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+    ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_rnn_layer.py)
+
-add_test(NAME topology_test
+add_test(NAME test_topology
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
     ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_topology.py
   WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
@@ -0,0 +1,155 @@
# Copyright PaddlePaddle contributors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import difflib
import unittest

import paddle.trainer_config_helpers as conf_helps
import paddle.v2.activation as activation
import paddle.v2.data_type as data_type
import paddle.v2.layer as layer
from paddle.trainer_config_helpers.config_parser_utils import \
    parse_network_config as parse_network


class RNNTest(unittest.TestCase):
    def test_simple_rnn(self):
        """Compare a simple RNN built with the old API and the new v2 API."""
        dict_dim = 10
        word_dim = 8
        hidden_dim = 8

        def parse_old_rnn():
            def step(y):
                mem = conf_helps.memory(name="rnn_state", size=hidden_dim)
                out = conf_helps.fc_layer(
                    input=[y, mem],
                    size=hidden_dim,
                    act=activation.Tanh(),
                    bias_attr=True,
                    name="rnn_state")
                return out

            def test():
                data = conf_helps.data_layer(name="word", size=dict_dim)
                embd = conf_helps.embedding_layer(input=data, size=word_dim)
                conf_helps.recurrent_group(name="rnn", step=step, input=embd)

            return str(parse_network(test))

        def parse_new_rnn():
            def new_step(y):
                mem = layer.memory(name="rnn_state", size=hidden_dim)
                out = layer.fc(input=[y, mem],
                               size=hidden_dim,
                               act=activation.Tanh(),
                               bias_attr=True,
                               name="rnn_state")
                return out

            data = layer.data(
                name="word", type=data_type.integer_value(dict_dim))
            embd = layer.embedding(input=data, size=word_dim)
            rnn_layer = layer.recurrent_group(
                name="rnn", step=new_step, input=embd)
            return str(layer.parse_network(rnn_layer))

        diff = difflib.unified_diff(parse_old_rnn().splitlines(True),
                                    parse_new_rnn().splitlines(True))
        print ''.join(diff)
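        # A stricter variant of this check would assert rather than print,
        # e.g. (sketch): self.assertEqual(parse_old_rnn(), parse_new_rnn())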

    def test_sequence_rnn_multi_input(self):
        """Compare a multi-input RNN with a boot layer under both APIs."""
        dict_dim = 10
        word_dim = 8
        hidden_dim = 8
        label_dim = 3

        def parse_old_rnn():
            def test():
                data = conf_helps.data_layer(name="word", size=dict_dim)
                label = conf_helps.data_layer(name="label", size=label_dim)
                emb = conf_helps.embedding_layer(input=data, size=word_dim)
                boot_layer = conf_helps.data_layer(name="boot", size=10)
                boot_layer = conf_helps.fc_layer(
                    name='boot_fc', input=boot_layer, size=10)

                def step(y, wid):
                    z = conf_helps.embedding_layer(input=wid, size=word_dim)
                    mem = conf_helps.memory(
                        name="rnn_state",
                        size=hidden_dim,
                        boot_layer=boot_layer)
                    out = conf_helps.fc_layer(
                        input=[y, z, mem],
                        size=hidden_dim,
                        act=conf_helps.TanhActivation(),
                        bias_attr=True,
                        name="rnn_state")
                    return out

                out = conf_helps.recurrent_group(
                    name="rnn", step=step, input=[emb, data])

                rep = conf_helps.last_seq(input=out)
                prob = conf_helps.fc_layer(
                    size=label_dim,
                    input=rep,
                    act=conf_helps.SoftmaxActivation(),
                    bias_attr=True)

                conf_helps.outputs(
                    conf_helps.classification_cost(
                        input=prob, label=label))

            return str(parse_network(test))

        def parse_new_rnn():
            data = layer.data(
                name="word", type=data_type.dense_vector(dict_dim))
            label = layer.data(
                name="label", type=data_type.dense_vector(label_dim))
            emb = layer.embedding(input=data, size=word_dim)
            boot_layer = layer.data(
                name="boot", type=data_type.dense_vector(10))
            boot_layer = layer.fc(name='boot_fc', input=boot_layer, size=10)

            def step(y, wid):
                z = layer.embedding(input=wid, size=word_dim)
                mem = layer.memory(
                    name="rnn_state", size=hidden_dim, boot_layer=boot_layer)
                out = layer.fc(input=[y, z, mem],
                               size=hidden_dim,
                               act=activation.Tanh(),
                               bias_attr=True,
                               name="rnn_state")
                return out

            out = layer.recurrent_group(
                name="rnn", step=step, input=[emb, data])

            rep = layer.last_seq(input=out)
            prob = layer.fc(size=label_dim,
                            input=rep,
                            act=activation.Softmax(),
                            bias_attr=True)

            cost = layer.classification_cost(input=prob, label=label)

            return str(layer.parse_network(cost))

        diff = difflib.unified_diff(parse_old_rnn().splitlines(True),
                                    parse_new_rnn().splitlines(True))
        print ''.join(diff)


if __name__ == '__main__':
    unittest.main()