From 9ccc94f4a4d5bd87793730be1a73888c09a55cb3 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 27 Feb 2017 19:56:33 +0800 Subject: [PATCH 01/22] srl api training --- demo/semantic_role_labeling/api_train_v2.py | 112 ++++++++++++++++++++ demo/semantic_role_labeling/model_v2.py | 103 ++++++++++++++++++ 2 files changed, 215 insertions(+) create mode 100644 demo/semantic_role_labeling/api_train_v2.py create mode 100644 demo/semantic_role_labeling/model_v2.py diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py new file mode 100644 index 0000000000..33b966cca5 --- /dev/null +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -0,0 +1,112 @@ +import numpy +import paddle.v2 as paddle +from paddle.trainer_config_helpers.atts import ParamAttr + +from mode_v2 import db_lstm + +word_dict_file = './data/wordDict.txt' +label_dict_file = './data/targetDict.txt' +predicate_file = './data/verbDict.txt' + +word_dict = dict() +label_dict = dict() +predicate_dict = dict() + +with open(word_dict_file, 'r') as f_word, \ + open(label_dict_file, 'r') as f_label, \ + open(predicate_file, 'r') as f_pre: + for i, line in enumerate(f_word): + w = line.strip() + word_dict[w] = i + + for i, line in enumerate(f_label): + w = line.strip() + label_dict[w] = i + + for i, line in enumerate(f_pre): + w = line.strip() + predicate_dict[w] = i + +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(predicate_dict) + + +def train_reader(file_name="data/feature"): + def reader(): + with open(file_name, 'r') as fdata: + for line in fdata: + sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ + line.strip().split('\t') + + words = sentence.split() + sen_len = len(words) + word_slot = [word_dict.get(w, UNK_IDX) for w in words] + + predicate_slot = [predicate_dict.get(predicate)] * sen_len + ctx_n2_slot = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_slot = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_slot = [word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_slot = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_slot = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + marks = mark.split() + mark_slot = [int(w) for w in marks] + + label_list = label.split() + label_slot = [label_dict.get(w) for w in label_list] + yield word_slot, ctx_n2_slot, ctx_n1_slot, \ + ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot + + return reader + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + label_dict_len = 500 + # define network topology + output = db_lstm() + target = paddle.layer.data(name='target', size=label_dict_len) + crf_cost = paddle.layer.crf_layer( + size=500, + input=output, + label=target, + param_attr=paddle.attr.Param( + name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding_layer( + name='crf_dec_l', + size=label_dict_len, + input=output, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + + topo = [crf_cost, crf_dec] + parameters = paddle.parameters.create(topo) + optimizer = paddle.optimizer.Momentum(momentum=0.01, learning_rate=2e-2) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + para = parameters.get('___fc_2__.w0') + print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, + event.cost, para.mean()) + + else: + pass + + trainer = paddle.trainer.SGD(update_equation=optimizer) + + trainer.train( + train_data_reader=train_reader, + batch_size=32, + 
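        # (annotation, not part of the patch) Two issues in this first
        # cut are fixed later in the series: the print above passes four
        # values to a three-placeholder format string (corrected in
        # PATCH 03), and cost/parameters later move from train() into
        # the SGD constructor.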
topology=topo, + parameters=parameters, + event_handler=event_handler, + num_passes=10000, + data_types=[], + reader_dict={}) + + +if __name__ == '__main__': + main() diff --git a/demo/semantic_role_labeling/model_v2.py b/demo/semantic_role_labeling/model_v2.py new file mode 100644 index 0000000000..d4d011770d --- /dev/null +++ b/demo/semantic_role_labeling/model_v2.py @@ -0,0 +1,103 @@ +import paddle.v2 as paddle + + +def db_lstm(word_dict_len, label_dict_len, pred_len): + mark_dict_len = 2 + word_dim = 32 + mark_dim = 5 + hidden_dim = 512 + depth = 8 + + #8 features + word = paddle.layer.data(name='word_data', size=word_dict_len) + predicate = paddle.layer.data(name='verb_data', size=pred_len) + + ctx_n2 = paddle.layer.data(name='ctx_n2_data', size=word_dict_len) + ctx_n1 = paddle.layer.data(name='ctx_n1_data', size=word_dict_len) + ctx_0 = paddle.layer.data(name='ctx_0_data', size=word_dict_len) + ctx_p1 = paddle.layer.data(name='ctx_p1_data', size=word_dict_len) + ctx_p2 = paddle.layer.data(name='ctx_p2_data', size=word_dict_len) + mark = paddle.layer.data(name='mark_data', size=mark_dict_len) + + default_std = 1 / math.sqrt(hidden_dim) / 3.0 + + emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) + std_0 = paddle.attr.Param(initial_std=0.) + std_default = paddle.attr.Param(initial_std=default_std) + + predicate_embedding = paddle.layer.embeding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) + mark_embedding = paddle.layer.embeding( + name='word_ctx-in_embedding', + size=mark_dim, + input=mark, + param_attr=std_0) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + paddle.layer.embeding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + + mix_hidden_lr = 1e-3 + lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) + hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + + lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + + #stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ]) + + lstm = paddle.layer.lstmemory( + input=mix_hidden, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] + + feature_out = paddle.layer.mixed( + size=label_dict_len, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ], ) + + return feature_out From da754d85de3ffcf850ad2b375d8922110c7279e1 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 28 
Feb 2017 14:55:16 +0800 Subject: [PATCH 02/22] srl api training --- demo/semantic_role_labeling/api_train_v2.py | 21 +++++++++++------ demo/semantic_role_labeling/model_v2.py | 25 +++++++++++---------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index 33b966cca5..daaf0f0582 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -1,8 +1,6 @@ import numpy import paddle.v2 as paddle -from paddle.trainer_config_helpers.atts import ParamAttr - -from mode_v2 import db_lstm +from model_v2 import db_lstm word_dict_file = './data/wordDict.txt' label_dict_file = './data/targetDict.txt' @@ -64,9 +62,8 @@ def train_reader(file_name="data/feature"): def main(): paddle.init(use_gpu=False, trainer_count=1) - label_dict_len = 500 # define network topology - output = db_lstm() + output = db_lstm(word_dict_len, label_dict_len, pred_len) target = paddle.layer.data(name='target', size=label_dict_len) crf_cost = paddle.layer.crf_layer( size=500, @@ -97,6 +94,17 @@ def main(): trainer = paddle.trainer.SGD(update_equation=optimizer) + reader_dict = { + 'word_data': 0, + 'verb_data': 1, + 'ctx_n2_data': 2, + 'ctx_n1_data': 3, + 'ctx_0_data': 4, + 'ctx_p1_data': 5, + 'ctx_p2_data': 6, + 'mark_data': 7, + 'target': 8 + } trainer.train( train_data_reader=train_reader, batch_size=32, @@ -104,8 +112,7 @@ def main(): parameters=parameters, event_handler=event_handler, num_passes=10000, - data_types=[], - reader_dict={}) + reader_dict=reader_dict) if __name__ == '__main__': diff --git a/demo/semantic_role_labeling/model_v2.py b/demo/semantic_role_labeling/model_v2.py index d4d011770d..a78190a2b2 100644 --- a/demo/semantic_role_labeling/model_v2.py +++ b/demo/semantic_role_labeling/model_v2.py @@ -1,3 +1,4 @@ +import math import paddle.v2 as paddle @@ -9,15 +10,18 @@ def db_lstm(word_dict_len, label_dict_len, pred_len): depth = 8 #8 features - word = paddle.layer.data(name='word_data', size=word_dict_len) - predicate = paddle.layer.data(name='verb_data', size=pred_len) + def d_type(size): + return paddle.data_type.integer_value_sequence(size) - ctx_n2 = paddle.layer.data(name='ctx_n2_data', size=word_dict_len) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', size=word_dict_len) - ctx_0 = paddle.layer.data(name='ctx_0_data', size=word_dict_len) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', size=word_dict_len) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', size=word_dict_len) - mark = paddle.layer.data(name='mark_data', size=mark_dict_len) + word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) + predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + + ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) + ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) + ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) + ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) + ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) default_std = 1 / math.sqrt(hidden_dim) / 3.0 @@ -31,10 +35,7 @@ def db_lstm(word_dict_len, label_dict_len, pred_len): param_attr=paddle.attr.Param( name='vemb', initial_std=default_std)) mark_embedding = paddle.layer.embeding( - name='word_ctx-in_embedding', - size=mark_dim, - input=mark, - param_attr=std_0) + size=mark_dim, 
input=mark, param_attr=std_0) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] emb_layers = [ From e6e8bfb44ef70320bcf1cca1abeebd6ff58281b4 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 28 Feb 2017 19:48:22 +0800 Subject: [PATCH 03/22] update --- demo/semantic_role_labeling/api_train_v2.py | 55 ++++++++++----------- demo/semantic_role_labeling/model_v2.py | 25 ++++++++-- 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index daaf0f0582..0317c818db 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -2,6 +2,8 @@ import numpy import paddle.v2 as paddle from model_v2 import db_lstm +UNK_IDX = 0 + word_dict_file = './data/wordDict.txt' label_dict_file = './data/targetDict.txt' predicate_file = './data/verbDict.txt' @@ -29,6 +31,10 @@ word_dict_len = len(word_dict) label_dict_len = len(label_dict) pred_len = len(predicate_dict) +print 'word_dict_len=%d' % word_dict_len +print 'label_dict_len=%d' % label_dict_len +print 'pred_len=%d' % pred_len + def train_reader(file_name="data/feature"): def reader(): @@ -63,31 +69,16 @@ def main(): paddle.init(use_gpu=False, trainer_count=1) # define network topology - output = db_lstm(word_dict_len, label_dict_len, pred_len) - target = paddle.layer.data(name='target', size=label_dict_len) - crf_cost = paddle.layer.crf_layer( - size=500, - input=output, - label=target, - param_attr=paddle.attr.Param( - name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) - - crf_dec = paddle.layer.crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=output, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - - topo = [crf_cost, crf_dec] - parameters = paddle.parameters.create(topo) + crf_cost, crf_dec = db_lstm(word_dict_len, label_dict_len, pred_len) + + #parameters = paddle.parameters.create([crf_cost, crf_dec]) + parameters = paddle.parameters.create(crf_cost) optimizer = paddle.optimizer.Momentum(momentum=0.01, learning_rate=2e-2) def event_handler(event): if isinstance(event, paddle.event.EndIteration): - para = parameters.get('___fc_2__.w0') print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, - event.cost, para.mean()) + event.cost) else: pass @@ -96,23 +87,27 @@ def main(): reader_dict = { 'word_data': 0, - 'verb_data': 1, - 'ctx_n2_data': 2, - 'ctx_n1_data': 3, - 'ctx_0_data': 4, - 'ctx_p1_data': 5, - 'ctx_p2_data': 6, + 'ctx_n2_data': 1, + 'ctx_n1_data': 2, + 'ctx_0_data': 3, + 'ctx_p1_data': 4, + 'ctx_p2_data': 5, + 'verb_data': 6, 'mark_data': 7, - 'target': 8 + 'target': 8, } + #trn_reader = paddle.reader.batched( + # paddle.reader.shuffle( + # train_reader(), buf_size=8192), batch_size=2) + trn_reader = paddle.reader.batched(train_reader(), batch_size=1) trainer.train( - train_data_reader=train_reader, - batch_size=32, - topology=topo, + reader=trn_reader, + cost=crf_cost, parameters=parameters, event_handler=event_handler, num_passes=10000, reader_dict=reader_dict) + #cost=[crf_cost, crf_dec], if __name__ == '__main__': diff --git a/demo/semantic_role_labeling/model_v2.py b/demo/semantic_role_labeling/model_v2.py index a78190a2b2..cec58e52c7 100644 --- a/demo/semantic_role_labeling/model_v2.py +++ b/demo/semantic_role_labeling/model_v2.py @@ -23,23 +23,25 @@ def db_lstm(word_dict_len, label_dict_len, pred_len): ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) mark = 
paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + default_std = 1 / math.sqrt(hidden_dim) / 3.0 emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) std_0 = paddle.attr.Param(initial_std=0.) std_default = paddle.attr.Param(initial_std=default_std) - predicate_embedding = paddle.layer.embeding( + predicate_embedding = paddle.layer.embedding( size=word_dim, input=predicate, param_attr=paddle.attr.Param( name='vemb', initial_std=default_std)) - mark_embedding = paddle.layer.embeding( + mark_embedding = paddle.layer.embedding( size=mark_dim, input=mark, param_attr=std_0) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] emb_layers = [ - paddle.layer.embeding( + paddle.layer.embedding( size=word_dim, input=x, param_attr=emb_para) for x in word_input ] emb_layers.append(predicate_embedding) @@ -101,4 +103,19 @@ def db_lstm(word_dict_len, label_dict_len, pred_len): input=input_tmp[1], param_attr=lstm_para_attr) ], ) - return feature_out + crf_cost = paddle.layer.crf(size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + + return crf_cost, crf_dec From 542eb736ab66ca5f7f974fde8d6a91bbfa781f4b Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 1 Mar 2017 15:47:07 +0800 Subject: [PATCH 04/22] update --- demo/semantic_role_labeling/api_train_v2.py | 37 +++++++++++---------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index 0317c818db..cfbd2a0224 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -1,4 +1,4 @@ -import numpy +import numpy as np import paddle.v2 as paddle from model_v2 import db_lstm @@ -31,10 +31,6 @@ word_dict_len = len(word_dict) label_dict_len = len(label_dict) pred_len = len(predicate_dict) -print 'word_dict_len=%d' % word_dict_len -print 'label_dict_len=%d' % label_dict_len -print 'pred_len=%d' % pred_len - def train_reader(file_name="data/feature"): def reader(): @@ -65,25 +61,34 @@ def train_reader(file_name="data/feature"): return reader +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header for float type. 
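        # (annotation) The parameter file layout assumed here is a fixed
        # 16-byte header followed by h*w raw float32 values; skipping the
        # header and reshaping the rest recovers the matrix, e.g.
        # load_parameter("data/emb", 44068, 32) for the embedding table.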
+ return np.fromfile(f, dtype=np.float32).reshape(h, w) + + def main(): paddle.init(use_gpu=False, trainer_count=1) # define network topology crf_cost, crf_dec = db_lstm(word_dict_len, label_dict_len, pred_len) - #parameters = paddle.parameters.create([crf_cost, crf_dec]) - parameters = paddle.parameters.create(crf_cost) + parameters = paddle.parameters.create([crf_cost, crf_dec]) optimizer = paddle.optimizer.Momentum(momentum=0.01, learning_rate=2e-2) def event_handler(event): if isinstance(event, paddle.event.EndIteration): - print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, - event.cost) - + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) else: pass - trainer = paddle.trainer.SGD(update_equation=optimizer) + trainer = paddle.trainer.SGD(cost=crf_cost, + parameters=parameters, + update_equation=optimizer) + + parameters.set('emb', load_parameter("data/emb", 44068, 32)) reader_dict = { 'word_data': 0, @@ -96,18 +101,14 @@ def main(): 'mark_data': 7, 'target': 8, } - #trn_reader = paddle.reader.batched( - # paddle.reader.shuffle( - # train_reader(), buf_size=8192), batch_size=2) - trn_reader = paddle.reader.batched(train_reader(), batch_size=1) + trn_reader = paddle.reader.batched( + paddle.reader.shuffle( + train_reader(), buf_size=8192), batch_size=10) trainer.train( reader=trn_reader, - cost=crf_cost, - parameters=parameters, event_handler=event_handler, num_passes=10000, reader_dict=reader_dict) - #cost=[crf_cost, crf_dec], if __name__ == '__main__': From 82ec9f225b210ff99d83b97e0e09938061aba4ee Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 1 Mar 2017 17:50:19 +0800 Subject: [PATCH 05/22] Training the understand sentiment model with the new API. --- demo/sentiment/train_with_new_api.py | 182 +++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 demo/sentiment/train_with_new_api.py diff --git a/demo/sentiment/train_with_new_api.py b/demo/sentiment/train_with_new_api.py new file mode 100644 index 0000000000..f937b02906 --- /dev/null +++ b/demo/sentiment/train_with_new_api.py @@ -0,0 +1,182 @@ +from os.path import join as join_path +import paddle.v2 as paddle +import paddle.v2.layer as layer +import paddle.v2.activation as activation +import paddle.v2.data_type as data_type + + +def sequence_conv_pool(input, + input_size, + context_len, + hidden_size, + name=None, + context_start=None, + pool_type=None, + context_proj_layer_name=None, + context_proj_param_attr=False, + fc_layer_name=None, + fc_param_attr=None, + fc_bias_attr=None, + fc_act=None, + pool_bias_attr=None, + fc_attr=None, + context_attr=None, + pool_attr=None): + """ + Text convolution pooling layers helper. + + Text input => Context Projection => FC Layer => Pooling => Output. + + :param name: name of output layer(pooling layer name) + :type name: basestring + :param input: name of input layer + :type input: LayerOutput + :param context_len: context projection length. See + context_projection's document. + :type context_len: int + :param hidden_size: FC Layer size. + :type hidden_size: int + :param context_start: context projection length. See + context_projection's context_start. + :type context_start: int or None + :param pool_type: pooling layer type. See pooling_layer's document. + :type pool_type: BasePoolingType. + :param context_proj_layer_name: context projection layer name. + None if user don't care. 
+ :type context_proj_layer_name: basestring + :param context_proj_param_attr: context projection parameter attribute. + None if user don't care. + :type context_proj_param_attr: ParameterAttribute or None. + :param fc_layer_name: fc layer name. None if user don't care. + :type fc_layer_name: basestring + :param fc_param_attr: fc layer parameter attribute. None if user don't care. + :type fc_param_attr: ParameterAttribute or None + :param fc_bias_attr: fc bias parameter attribute. False if no bias, + None if user don't care. + :type fc_bias_attr: ParameterAttribute or None + :param fc_act: fc layer activation type. None means tanh + :type fc_act: BaseActivation + :param pool_bias_attr: pooling layer bias attr. None if don't care. + False if no bias. + :type pool_bias_attr: ParameterAttribute or None. + :param fc_attr: fc layer extra attribute. + :type fc_attr: ExtraLayerAttribute + :param context_attr: context projection layer extra attribute. + :type context_attr: ExtraLayerAttribute + :param pool_attr: pooling layer extra attribute. + :type pool_attr: ExtraLayerAttribute + :return: output layer name. + :rtype: LayerOutput + """ + # Set Default Value to param + context_proj_layer_name = "%s_conv_proj" % name \ + if context_proj_layer_name is None else context_proj_layer_name + + with layer.mixed( + name=context_proj_layer_name, + size=input_size * context_len, + act=activation.Linear(), + layer_attr=context_attr) as m: + m += layer.context_projection( + input=input, + context_len=context_len, + context_start=context_start, + padding_attr=context_proj_param_attr) + + fc_layer_name = "%s_conv_fc" % name \ + if fc_layer_name is None else fc_layer_name + fl = layer.fc(name=fc_layer_name, + input=m, + size=hidden_size, + act=fc_act, + layer_attr=fc_attr, + param_attr=fc_param_attr, + bias_attr=fc_bias_attr) + + return layer.pooling( + name=name, + input=fl, + pooling_type=pool_type, + bias_attr=pool_bias_attr, + layer_attr=pool_attr) + + +def convolution_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=128, + is_predict=False): + data = layer.data("word", data_type.integer_value_sequence(input_dim)) + emb = layer.embedding(input=data, size=emb_dim) + conv_3 = sequence_conv_pool( + input=emb, input_size=emb_dim, context_len=3, hidden_size=hid_dim) + conv_4 = sequence_conv_pool( + input=emb, input_size=emb_dim, context_len=4, hidden_size=hid_dim) + output = layer.fc(input=[conv_3, conv_4], + size=class_dim, + act=activation.Softmax()) + lbl = layer.data("label", data_type.integer_value(1)) + cost = layer.classification_cost(input=output, label=lbl) + return cost + + +def data_reader(): + data_dir = "./data/pre-imdb" + train_file = "train_part_000" + test_file = "test_part_000" + dict_file = "dict.txt" + train_file = join_path(data_dir, train_file) + test_file = join_path(data_dir, test_file) + dict_file = join_path(data_dir, dict_file) + + with open(dict_file, 'r') as fdict, open(train_file, 'r') as fdata: + dictionary = dict() + for i, line in enumerate(fdict): + dictionary[line.split('\t')[0]] = i + + print('dict len : %d' % (len(dictionary))) + for line_count, line in enumerate(fdata): + label, comment = line.strip().split('\t\t') + label = int(label) + words = comment.split() + word_slot = [dictionary[w] for w in words if w in dictionary] + yield (word_slot, label) + + +if __name__ == '__main__': + data_dir = "./data/pre-imdb" + train_list = "train.list" + test_list = "test.list" + dict_file = "dict.txt" + dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines()) + class_dim = 
len(open(join_path(data_dir, 'labels.list')).readlines()) + is_predict = False + + # init + paddle.init(use_gpu=True, trainer_count=4) + + # network config + cost = convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict) + + # create parameters + parameters = paddle.parameters.create(cost) + + adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=adam_optimizer) + + trainer.train( + reader=paddle.reader.batched( + data_reader, batch_size=128), + event_handler=event_handler, + reader_dict={'word': 0, + 'label': 1}, + num_passes=10) From 3a5f98c36a13a4c027ee87461f52b49ebb6b6002 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 1 Mar 2017 18:59:00 +0800 Subject: [PATCH 06/22] Add reader.shuffle --- demo/sentiment/train_with_new_api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demo/sentiment/train_with_new_api.py b/demo/sentiment/train_with_new_api.py index f937b02906..59a303c0d5 100644 --- a/demo/sentiment/train_with_new_api.py +++ b/demo/sentiment/train_with_new_api.py @@ -134,7 +134,6 @@ def data_reader(): for i, line in enumerate(fdict): dictionary[line.split('\t')[0]] = i - print('dict len : %d' % (len(dictionary))) for line_count, line in enumerate(fdata): label, comment = line.strip().split('\t\t') label = int(label) @@ -165,7 +164,7 @@ if __name__ == '__main__': def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 1 == 0: + if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f, %s" % ( event.pass_id, event.batch_id, event.cost, event.metrics) @@ -175,7 +174,8 @@ if __name__ == '__main__': trainer.train( reader=paddle.reader.batched( - data_reader, batch_size=128), + paddle.reader.shuffle( + data_reader, buf_size=4096), batch_size=128), event_handler=event_handler, reader_dict={'word': 0, 'label': 1}, From 41f04e5ae4a459c0934cc7bca55e75dbbbb51b8a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 1 Mar 2017 19:25:09 +0800 Subject: [PATCH 07/22] Add regularization and model_average --- demo/sentiment/train_with_new_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/demo/sentiment/train_with_new_api.py b/demo/sentiment/train_with_new_api.py index 59a303c0d5..bec07de92a 100644 --- a/demo/sentiment/train_with_new_api.py +++ b/demo/sentiment/train_with_new_api.py @@ -160,7 +160,10 @@ if __name__ == '__main__': # create parameters parameters = paddle.parameters.create(cost) - adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01) + adam_optimizer = paddle.optimizer.Adam( + learning_rate=2e-3, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage(average_window=0.5)) def event_handler(event): if isinstance(event, paddle.event.EndIteration): From 1d0a8c2f745dc15d17a83ac43e8e3ca9296d6216 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 1 Mar 2017 19:31:57 +0800 Subject: [PATCH 08/22] rename train_v2.py --- demo/sentiment/{train_with_new_api.py => train_v2.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename demo/sentiment/{train_with_new_api.py => train_v2.py} (100%) diff --git a/demo/sentiment/train_with_new_api.py b/demo/sentiment/train_v2.py similarity index 100% rename from 
demo/sentiment/train_with_new_api.py rename to demo/sentiment/train_v2.py From 803da664eddfc85bb55e192b7a98c696bf4fe112 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 1 Mar 2017 19:49:17 +0800 Subject: [PATCH 09/22] Add test --- demo/sentiment/train_v2.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py index bec07de92a..a764798add 100644 --- a/demo/sentiment/train_v2.py +++ b/demo/sentiment/train_v2.py @@ -142,6 +142,28 @@ def data_reader(): yield (word_slot, label) +def test_reader(): + data_dir = "./data/pre-imdb" + train_file = "train_part_000" + test_file = "test_part_000" + dict_file = "dict.txt" + train_file = join_path(data_dir, train_file) + test_file = join_path(data_dir, test_file) + dict_file = join_path(data_dir, dict_file) + + with open(dict_file, 'r') as fdict, open(test_file, 'r') as ftest: + dictionary = dict() + for i, line in enumerate(fdict): + dictionary[line.split('\t')[0]] = i + + for line_count, line in enumerate(ftest): + label, comment = line.strip().split('\t\t') + label = int(label) + words = comment.split() + word_slot = [dictionary[w] for w in words if w in dictionary] + yield (word_slot, label) + + if __name__ == '__main__': data_dir = "./data/pre-imdb" train_list = "train.list" @@ -170,6 +192,13 @@ if __name__ == '__main__': if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f, %s" % ( event.pass_id, event.batch_id, event.cost, event.metrics) + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batched( + test_reader, batch_size=128), + reader_dict={'word': 0, + 'label': 1}) + print "Test with Pass %d, %s" % (event.pass_id, result.metrics) trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, From 0a33f170a423cc238f7b1c37a8e76a48ce9f48ec Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 1 Mar 2017 20:35:04 +0800 Subject: [PATCH 10/22] Add stacked lstm network --- demo/sentiment/train_v2.py | 74 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py index a764798add..779bfee5b6 100644 --- a/demo/sentiment/train_v2.py +++ b/demo/sentiment/train_v2.py @@ -1,4 +1,6 @@ from os.path import join as join_path +import paddle.trainer_config_helpers.attrs as attrs +from paddle.trainer_config_helpers.poolings import MaxPooling import paddle.v2 as paddle import paddle.v2.layer as layer import paddle.v2.activation as activation @@ -115,7 +117,73 @@ def convolution_net(input_dim, output = layer.fc(input=[conv_3, conv_4], size=class_dim, act=activation.Softmax()) - lbl = layer.data("label", data_type.integer_value(1)) + lbl = layer.data("label", data_type.integer_value(2)) + cost = layer.classification_cost(input=output, label=lbl) + return cost + + +def stacked_lstm_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3, + is_predict=False): + """ + A Wrapper for sentiment classification task. + This network uses bi-directional recurrent network, + consisting three LSTM layers. This configure is referred to + the paper as following url, but use fewer layrs. + http://www.aclweb.org/anthology/P15-1109 + + input_dim: here is word dictionary dimension. + class_dim: number of categories. + emb_dim: dimension of word embedding. + hid_dim: dimension of hidden layer. + stacked_num: number of stacked lstm-hidden layer. + is_predict: is predicting or not. 
+ Some layers is not needed in network when predicting. + """ + assert stacked_num % 2 == 1 + + layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5) + fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3) + lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.) + para_attr = [fc_para_attr, lstm_para_attr] + bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.) + relu = activation.Relu() + linear = activation.Linear() + + data = layer.data("word", data_type.integer_value_sequence(input_dim)) + emb = layer.embedding(input=data, size=emb_dim) + + fc1 = layer.fc(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr) + lstm1 = layer.lstmemory( + input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) + + inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): + fc = layer.fc(input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) + lstm = layer.lstmemory( + input=fc, + reverse=(i % 2) == 0, + act=relu, + bias_attr=bias_attr, + layer_attr=layer_attr) + inputs = [fc, lstm] + + fc_last = layer.pooling(input=inputs[0], pooling_type=MaxPooling()) + lstm_last = layer.pooling(input=inputs[1], pooling_type=MaxPooling()) + output = layer.fc(input=[fc_last, lstm_last], + size=class_dim, + act=activation.Softmax(), + bias_attr=bias_attr, + param_attr=para_attr) + + lbl = layer.data("label", data_type.integer_value(2)) cost = layer.classification_cost(input=output, label=lbl) return cost @@ -177,7 +245,9 @@ if __name__ == '__main__': paddle.init(use_gpu=True, trainer_count=4) # network config - cost = convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict) + # cost = convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict) + cost = stacked_lstm_net( + dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict) # create parameters parameters = paddle.parameters.create(cost) From 1524f2041ee3e5dd6bf1613afeb16ed3884939e9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 2 Mar 2017 13:03:41 +0800 Subject: [PATCH 11/22] Add testing cost. 
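The test pass now reports an averaged cost next to the evaluator
metrics. In effect SGD.test() accumulates the summed cost of each
forward-only batch and normalizes by sample count, roughly:

    # sketch of the averaging added below, using names from the patch
    total_cost = 0
    num_samples = 0.0
    for data_batch in reader():
        num_samples += len(data_batch)
        # forward() fills out_args; sumCosts() totals its cost values
        total_cost += out_args.sumCosts()
    avg_cost = total_cost / num_samples  # exposed as TestResult.cost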
--- demo/mnist/api_train_v2.py | 9 +++++---- python/paddle/v2/event.py | 3 ++- python/paddle/v2/trainer.py | 12 ++++++++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index 06beb7024d..00d1022175 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -30,10 +30,11 @@ def main(): result = trainer.test(reader=paddle.reader.batched( paddle.dataset.mnist.test(), batch_size=256)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - + print "Pass %d, Batch %d, Cost %.2f, %s, " \ + "Testing cost %.2f metrics %s" % ( + event.pass_id, event.batch_id, event.cost, + event.metrics, + result.cost, result.metrics) else: pass diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index a78bcf076c..a429e36b63 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -34,8 +34,9 @@ class WithMetric(object): class TestResult(WithMetric): - def __init__(self, evaluator): + def __init__(self, evaluator, cost): super(TestResult, self).__init__(evaluator) + self.cost = cost class BeginPass(object): diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 5003f55f3e..58ec6dd5fe 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -123,9 +123,8 @@ class SGD(ITrainer): for each_param in self.__gradient_machine__.getParameters(): updater.update(each_param) # Get cost. We use numpy to calculate total cost for this batch. - cost_vec = out_args.getSlotValue(0) - cost_vec = cost_vec.copyToNumpyMat() - cost = cost_vec.sum() / len(data_batch) + cost_sum = out_args.sumCosts() + cost = cost_sum / len(data_batch) updater.finishBatch(cost) batch_evaluator.finish() event_handler( @@ -154,13 +153,18 @@ class SGD(ITrainer): evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) evaluator.start() + total_cost = 0 + num_samples = 0.0 for data_batch in reader(): + num_samples += len(data_batch) self.__gradient_machine__.forward( feeder(data_batch), out_args, api.PASS_TEST) + total_cost += out_args.sumCosts() self.__gradient_machine__.eval(evaluator) evaluator.finish() - return v2_event.TestResult(evaluator=evaluator) + return v2_event.TestResult( + evaluator=evaluator, cost=total_cost / num_samples) def __check_train_args__(reader, event_handler, **kwargs): From d3c755df3fe6009ed2cde1b5dca41196e4024aa7 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 2 Mar 2017 13:41:51 +0800 Subject: [PATCH 12/22] Refine code --- demo/sentiment/train_v2.py | 95 ++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 56 deletions(-) diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py index 779bfee5b6..3d595fad30 100644 --- a/demo/sentiment/train_v2.py +++ b/demo/sentiment/train_v2.py @@ -1,3 +1,4 @@ +import sys from os.path import join as join_path import paddle.trainer_config_helpers.attrs as attrs from paddle.trainer_config_helpers.poolings import MaxPooling @@ -188,88 +189,69 @@ def stacked_lstm_net(input_dim, return cost -def data_reader(): - data_dir = "./data/pre-imdb" - train_file = "train_part_000" - test_file = "test_part_000" - dict_file = "dict.txt" - train_file = join_path(data_dir, train_file) - test_file = join_path(data_dir, test_file) - dict_file = join_path(data_dir, dict_file) - - with open(dict_file, 'r') as fdict, open(train_file, 'r') as fdata: - dictionary = dict() - for i, line in 
enumerate(fdict): - dictionary[line.split('\t')[0]] = i - - for line_count, line in enumerate(fdata): - label, comment = line.strip().split('\t\t') - label = int(label) - words = comment.split() - word_slot = [dictionary[w] for w in words if w in dictionary] - yield (word_slot, label) - - -def test_reader(): - data_dir = "./data/pre-imdb" - train_file = "train_part_000" - test_file = "test_part_000" - dict_file = "dict.txt" - train_file = join_path(data_dir, train_file) - test_file = join_path(data_dir, test_file) - dict_file = join_path(data_dir, dict_file) - - with open(dict_file, 'r') as fdict, open(test_file, 'r') as ftest: - dictionary = dict() - for i, line in enumerate(fdict): - dictionary[line.split('\t')[0]] = i - - for line_count, line in enumerate(ftest): - label, comment = line.strip().split('\t\t') - label = int(label) - words = comment.split() - word_slot = [dictionary[w] for w in words if w in dictionary] - yield (word_slot, label) +def data_reader(data_file, dict_file): + def reader(): + with open(dict_file, 'r') as fdict, open(data_file, 'r') as fdata: + dictionary = dict() + for i, line in enumerate(fdict): + dictionary[line.split('\t')[0]] = i + + for line_count, line in enumerate(fdata): + label, comment = line.strip().split('\t\t') + label = int(label) + words = comment.split() + word_slot = [dictionary[w] for w in words if w in dictionary] + yield (word_slot, label) + + return reader if __name__ == '__main__': - data_dir = "./data/pre-imdb" - train_list = "train.list" - test_list = "test.list" - dict_file = "dict.txt" - dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines()) - class_dim = len(open(join_path(data_dir, 'labels.list')).readlines()) - is_predict = False + # data file + train_file = "./data/pre-imdb/train_part_000" + test_file = "./data/pre-imdb/test_part_000" + dict_file = "./data/pre-imdb/dict.txt" + labels = "./data/pre-imdb/labels.list" # init paddle.init(use_gpu=True, trainer_count=4) # network config - # cost = convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict) - cost = stacked_lstm_net( - dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict) + dict_dim = len(open(dict_file).readlines()) + class_dim = len(open(labels).readlines()) + + # Please choose the way to build the network + # by uncommenting the corresponding line. 
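    # (annotation) convolution_net and stacked_lstm_net both read the
    # same 'word'/'label' data layers and both return a classification
    # cost, so they are drop-in alternatives here; only the uncommented
    # line decides the topology.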
+ cost = convolution_net(dict_dim, class_dim=class_dim) + # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) # create parameters parameters = paddle.parameters.create(cost) + # create optimizer adam_optimizer = paddle.optimizer.Adam( learning_rate=2e-3, regularization=paddle.optimizer.L2Regularization(rate=8e-4), model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + # End batch and end pass event handler def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( + print "\nPass %d, Batch %d, Cost %f, %s" % ( event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() if isinstance(event, paddle.event.EndPass): result = trainer.test( reader=paddle.reader.batched( - test_reader, batch_size=128), + data_reader(test_file, dict_file), batch_size=128), reader_dict={'word': 0, 'label': 1}) - print "Test with Pass %d, %s" % (event.pass_id, result.metrics) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + # create trainer trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, update_equation=adam_optimizer) @@ -277,7 +259,8 @@ if __name__ == '__main__': trainer.train( reader=paddle.reader.batched( paddle.reader.shuffle( - data_reader, buf_size=4096), batch_size=128), + data_reader(train_file, dict_file), buf_size=4096), + batch_size=128), event_handler=event_handler, reader_dict={'word': 0, 'label': 1}, From e4007337ae88d0874f8d7c0bc41e9aa641de38b7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 2 Mar 2017 15:09:21 +0800 Subject: [PATCH 13/22] Follow comments --- python/paddle/v2/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 58ec6dd5fe..21a1642c36 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -122,7 +122,6 @@ class SGD(ITrainer): self.__gradient_machine__.eval(batch_evaluator) for each_param in self.__gradient_machine__.getParameters(): updater.update(each_param) - # Get cost. We use numpy to calculate total cost for this batch. 
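        # (annotation) sumCosts() sums the batch cost without the
        # getSlotValue(0) / copyToNumpyMat() / sum() round trip through
        # numpy that PATCH 11 removed; the comment deleted above became
        # stale once numpy was out of that path.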
cost_sum = out_args.sumCosts() cost = cost_sum / len(data_batch) updater.finishBatch(cost) From 3b8a8f81142e7eaea06e8e43e41bcb7bc73b0e09 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 2 Mar 2017 15:10:54 +0800 Subject: [PATCH 14/22] Follow comments --- demo/mnist/api_train_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index 00d1022175..575a32b322 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -30,7 +30,7 @@ def main(): result = trainer.test(reader=paddle.reader.batched( paddle.dataset.mnist.test(), batch_size=256)) - print "Pass %d, Batch %d, Cost %.2f, %s, " \ + print "Pass %d, Batch %d, Cost %.2f, %s\n" \ "Testing cost %.2f metrics %s" % ( event.pass_id, event.batch_id, event.cost, event.metrics, From 9059eea4f46cf47a9f1382b97f25ab5f4586a5da Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Thu, 2 Mar 2017 15:58:39 +0800 Subject: [PATCH 15/22] Fix k8s cluster job rerunable --- doc/howto/usage/k8s/src/k8s_train/start_paddle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py index f1a770ccb5..935c12bb67 100755 --- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py +++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py @@ -132,7 +132,8 @@ def startPaddle(idMap={}, train_args_dict=None): logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId) if not os.path.exists(JOB_PATH_OUTPUT): os.makedirs(JOB_PATH_OUTPUT) - os.mkdir(logDir) + if not os.path.exists(logDir): + os.mkdir(logDir) copyCommand = 'cp -rf ' + JOB_PATH + \ "/" + str(trainerId) + "/data/*" + " ./data/" os.system(copyCommand) From 4a94f8a4473b96161b721341fd0a889d34367aed Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 2 Mar 2017 16:00:59 +0800 Subject: [PATCH 16/22] refine api training --- demo/semantic_role_labeling/api_train_v2.py | 221 +++++++++++++------- demo/semantic_role_labeling/model_v2.py | 121 ----------- python/paddle/v2/dataset/__init__.py | 2 +- python/paddle/v2/dataset/conll05.py | 6 +- 4 files changed, 147 insertions(+), 203 deletions(-) delete mode 100644 demo/semantic_role_labeling/model_v2.py diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index cfbd2a0224..8ce6faaa1b 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -1,69 +1,142 @@ +import sys +import math import numpy as np import paddle.v2 as paddle -from model_v2 import db_lstm +import paddle.v2.dataset.conll05 as conll05 UNK_IDX = 0 -word_dict_file = './data/wordDict.txt' -label_dict_file = './data/targetDict.txt' -predicate_file = './data/verbDict.txt' -word_dict = dict() -label_dict = dict() -predicate_dict = dict() - -with open(word_dict_file, 'r') as f_word, \ - open(label_dict_file, 'r') as f_label, \ - open(predicate_file, 'r') as f_pre: - for i, line in enumerate(f_word): - w = line.strip() - word_dict[w] = i - - for i, line in enumerate(f_label): - w = line.strip() - label_dict[w] = i - - for i, line in enumerate(f_pre): - w = line.strip() - predicate_dict[w] = i - -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_len = len(predicate_dict) - - -def train_reader(file_name="data/feature"): - def reader(): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = 
sentence.split() - sen_len = len(words) - word_slot = [word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [label_dict.get(w) for w in label_list] - yield word_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot - - return reader +def db_lstm(): + word_dict, verb_dict, label_dict = conll05.get_dict() + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + pred_len = len(verb_dict) + print 'word_dict_len,', word_dict_len + print 'label_dict_len,', label_dict_len + print 'pred_len,', pred_len + + mark_dict_len = 2 + word_dim = 32 + mark_dim = 5 + hidden_dim = 512 + depth = 8 + + #8 features + def d_type(size): + return paddle.data_type.integer_value_sequence(size) + + word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) + predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + + ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) + ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) + ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) + ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) + ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + + default_std = 1 / math.sqrt(hidden_dim) / 3.0 + + emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) + std_0 = paddle.attr.Param(initial_std=0.) 
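    # (annotation) Three initialization presets: emb_para pins the
    # shared 'emb' table (initial_std and learning_rate both 0, so it
    # is loaded from the pretrained file but never updated), std_0
    # zero-initializes, and std_default (next line) scales as
    # 1/sqrt(hidden_dim)/3.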
+ std_default = paddle.attr.Param(initial_std=default_std) + + predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) + mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + + mix_hidden_lr = 1e-3 + lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) + hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + + lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + + #stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ]) + + lstm = paddle.layer.lstmemory( + input=mix_hidden, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] + + feature_out = paddle.layer.mixed( + size=label_dict_len, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ], ) + + crf_cost = paddle.layer.crf(size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + + return crf_cost, crf_dec def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. + f.read(16) # skip header. 
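        # (annotation) h and w must match the embedding table declared
        # in db_lstm(), i.e. rows = word_dict_len and cols = word_dim;
        # hence the load_parameter(conll05.get_embedding(), 44068, 32)
        # call further down.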
return np.fromfile(f, dtype=np.float32).reshape(h, w) @@ -71,44 +144,36 @@ def main(): paddle.init(use_gpu=False, trainer_count=1) # define network topology - crf_cost, crf_dec = db_lstm(word_dict_len, label_dict_len, pred_len) + crf_cost, crf_dec = db_lstm() + # create parameters parameters = paddle.parameters.create([crf_cost, crf_dec]) - optimizer = paddle.optimizer.Momentum(momentum=0.01, learning_rate=2e-2) + + # create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0, + learning_rate=2e-2, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage( + average_window=0.5, max_average_window=10000), ) def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f, %s" % ( event.pass_id, event.batch_id, event.cost, event.metrics) - else: - pass trainer = paddle.trainer.SGD(cost=crf_cost, parameters=parameters, update_equation=optimizer) + parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) - parameters.set('emb', load_parameter("data/emb", 44068, 32)) - - reader_dict = { - 'word_data': 0, - 'ctx_n2_data': 1, - 'ctx_n1_data': 2, - 'ctx_0_data': 3, - 'ctx_p1_data': 4, - 'ctx_p2_data': 5, - 'verb_data': 6, - 'mark_data': 7, - 'target': 8, - } trn_reader = paddle.reader.batched( paddle.reader.shuffle( - train_reader(), buf_size=8192), batch_size=10) + conll05.test, buf_size=8192), batch_size=10) + trainer.train( - reader=trn_reader, - event_handler=event_handler, - num_passes=10000, - reader_dict=reader_dict) + reader=trn_reader, event_handler=event_handler, num_passes=10000) if __name__ == '__main__': diff --git a/demo/semantic_role_labeling/model_v2.py b/demo/semantic_role_labeling/model_v2.py deleted file mode 100644 index cec58e52c7..0000000000 --- a/demo/semantic_role_labeling/model_v2.py +++ /dev/null @@ -1,121 +0,0 @@ -import math -import paddle.v2 as paddle - - -def db_lstm(word_dict_len, label_dict_len, pred_len): - mark_dict_len = 2 - word_dim = 32 - mark_dim = 5 - hidden_dim = 512 - depth = 8 - - #8 features - def d_type(size): - return paddle.data_type.integer_value_sequence(size) - - word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) - predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) - - ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) - ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) - mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - - default_std = 1 / math.sqrt(hidden_dim) / 3.0 - - emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) - std_0 = paddle.attr.Param(initial_std=0.) 
- std_default = paddle.attr.Param(initial_std=default_std) - - predicate_embedding = paddle.layer.embedding( - size=word_dim, - input=predicate, - param_attr=paddle.attr.Param( - name='vemb', initial_std=default_std)) - mark_embedding = paddle.layer.embedding( - size=mark_dim, input=mark, param_attr=std_0) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - paddle.layer.embedding( - size=word_dim, input=x, param_attr=emb_para) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - hidden_0 = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - - mix_hidden_lr = 1e-3 - lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) - hidden_para_attr = paddle.attr.Param( - initial_std=default_std, learning_rate=mix_hidden_lr) - - lstm_0 = paddle.layer.lstmemory( - input=hidden_0, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - bias_attr=std_0, - param_attr=lstm_para_attr) - - #stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = paddle.layer.lstmemory( - input=mix_hidden, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - - feature_out = paddle.layer.mixed( - size=label_dict_len, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - - crf_cost = paddle.layer.crf(size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', - initial_std=default_std, - learning_rate=mix_hidden_lr)) - - crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - - return crf_cost, crf_dec diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 15460b820d..90803628e3 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -14,4 +14,4 @@ import mnist -__all__ = ['mnist'] +__all__ = ['mnist', 'cifar', 'imdb', 'conll05', 'imikolov', 'movielens'] diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 7874161a05..52f19d2115 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -160,7 +160,6 @@ def reader_creator(corpus_reader, ctx_p2 = 'eos' word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] - pred_idx = [predicate_dict.get(predicate)] * sen_len ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len @@ -168,10 +167,11 @@ def reader_creator(corpus_reader, ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + pred_idx = [predicate_dict.get(predicate)] * 
sen_len label_idx = [label_dict.get(w) for w in labels] - yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \ - ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx return reader() From 11fdb4f041250d906a115c00d4e37b76a4bf8905 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 2 Mar 2017 16:11:30 +0800 Subject: [PATCH 17/22] Check system's protobuf for internal users --- cmake/external/protobuf.cmake | 82 ++++++++++++++++++----------------- cmake/external/python.cmake | 4 -- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 84f459033f..26da7e8e38 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,46 +14,50 @@ INCLUDE(ExternalProject) -SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) -SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) -SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) +FIND_PACKAGE(Protobuf) -INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) +IF(NOT PROTOBUF_FOUND) + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) + SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) + + IF(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) + ELSE(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE) + ENDIF(WIN32) -IF(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) -ELSE(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." 
FORCE) -ENDIF(WIN32) + ExternalProject_Add( + protobuf + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PROTOBUF_SOURCES_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + GIT_REPOSITORY "https://github.com/google/protobuf.git" + GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake + -Dprotobuf_BUILD_TESTS=OFF + -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + ) -ExternalProject_Add( - protobuf - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PROTOBUF_SOURCES_DIR} - UPDATE_COMMAND "" - DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake - -Dprotobuf_BUILD_TESTS=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib -) + LIST(APPEND external_project_dependencies protobuf) +ENDIF(NOT PROTOBUF_FOUND) -LIST(APPEND external_project_dependencies protobuf) +INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 6372a9a768..0accf1a8dd 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -221,7 +221,3 @@ ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) - -MESSAGE("[Paddle] Python Executable: ${PYTHON_EXECUTABLE}") -MESSAGE("[Paddle] Python Include: ${PYTHON_INCLUDE_DIRS}") -MESSAGE("[Paddle] Python Libraries: ${PYTHON_LIBRARIES}") From 5ce504b19b96d94dec5ceab2e2443a091b690cf0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 2 Mar 2017 20:50:23 +0800 Subject: [PATCH 18/22] Fix duplicated forward/backward in trainer. 
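
The `startBatch` / `forwardBackward` pair removed below ran before the `BeginIteration` event fired; the commit title says the forward/backward was duplicated, so the same work presumably happens again later in the loop body, meaning every batch was trained twice per pass. A minimal, self-contained toy sketch of that effect (invented names, not the real trainer internals):

```python
# Toy illustration of a duplicated forward/backward call: count how many
# times each batch is trained per pass. Names are illustrative only.


class ToyGradientMachine(object):
    def __init__(self):
        self.forward_backward_calls = 0

    def forwardBackward(self, batch):
        self.forward_backward_calls += 1


def buggy_pass(machine, batches):
    for batch in batches:
        machine.forwardBackward(batch)  # duplicated call, like the one removed below
        machine.forwardBackward(batch)  # the call that should remain
    return machine.forward_backward_calls


def fixed_pass(machine, batches):
    for batch in batches:
        machine.forwardBackward(batch)  # exactly once per batch
    return machine.forward_backward_calls


assert buggy_pass(ToyGradientMachine(), range(10)) == 20  # twice per batch
assert fixed_pass(ToyGradientMachine(), range(10)) == 10  # once per batch
```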
--- python/paddle/v2/trainer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 5003f55f3e..5cff75e39d 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -108,9 +108,6 @@ class SGD(ITrainer): pass_evaluator.start() updater.startPass() for batch_id, data_batch in enumerate(reader()): - pass_type = updater.startBatch(len(data_batch)) - self.__gradient_machine__.forwardBackward( - feeder(data_batch), out_args, pass_type) batch_evaluator.start() event_handler( v2_event.BeginIteration( From 172ac8af7abb0b54f47abb7eb067fbd538ab5b57 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 2 Mar 2017 20:58:05 +0800 Subject: [PATCH 19/22] update --- demo/semantic_role_labeling/api_train_v2.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index 8ce6faaa1b..c582724185 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -4,17 +4,12 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 -UNK_IDX = 0 - def db_lstm(): word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) label_dict_len = len(label_dict) pred_len = len(verb_dict) - print 'word_dict_len,', word_dict_len - print 'label_dict_len,', label_dict_len - print 'pred_len,', pred_len mark_dict_len = 2 word_dim = 32 From 4a265b5200bb86ef81f08d9fce516330b2c2f41a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 2 Mar 2017 21:42:11 +0800 Subject: [PATCH 20/22] Use reader in dataset imdb.py --- demo/sentiment/train_v2.py | 38 ++++++++------------------------ python/paddle/v2/dataset/imdb.py | 5 +++++ 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py index 3d595fad30..0fa7494853 100644 --- a/demo/sentiment/train_v2.py +++ b/demo/sentiment/train_v2.py @@ -2,10 +2,11 @@ import sys from os.path import join as join_path import paddle.trainer_config_helpers.attrs as attrs from paddle.trainer_config_helpers.poolings import MaxPooling -import paddle.v2 as paddle import paddle.v2.layer as layer import paddle.v2.activation as activation import paddle.v2.data_type as data_type +import paddle.v2.dataset.imdb as imdb +import paddle.v2 as paddle def sequence_conv_pool(input, @@ -189,36 +190,15 @@ def stacked_lstm_net(input_dim, return cost -def data_reader(data_file, dict_file): - def reader(): - with open(dict_file, 'r') as fdict, open(data_file, 'r') as fdata: - dictionary = dict() - for i, line in enumerate(fdict): - dictionary[line.split('\t')[0]] = i - - for line_count, line in enumerate(fdata): - label, comment = line.strip().split('\t\t') - label = int(label) - words = comment.split() - word_slot = [dictionary[w] for w in words if w in dictionary] - yield (word_slot, label) - - return reader - - if __name__ == '__main__': - # data file - train_file = "./data/pre-imdb/train_part_000" - test_file = "./data/pre-imdb/test_part_000" - dict_file = "./data/pre-imdb/dict.txt" - labels = "./data/pre-imdb/labels.list" - # init paddle.init(use_gpu=True, trainer_count=4) # network config - dict_dim = len(open(dict_file).readlines()) - class_dim = len(open(labels).readlines()) + print 'load dictionary...' + word_dict = imdb.word_dict() + dict_dim = len(word_dict) + class_dim = 2 # Please choose the way to build the network # by uncommenting the corresponding line. 
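
With this patch the demo stops parsing its own preprocessed data files and builds everything from the packaged dataset. A short sketch of the new data path; this assumes, as the `lambda: imdb.train(word_dict)` wrappers in the hunks below suggest, that `imdb.train()` and `imdb.test()` return iterables of `(word_ids, label)` pairs:

```python
import paddle.v2.dataset.imdb as imdb

# Build the vocabulary from the raw aclImdb corpus, via the word_dict()
# helper this patch adds to imdb.py (the 150 looks like a frequency cut-off).
word_dict = imdb.word_dict()
print 'dict size:', len(word_dict)

# Peek at one training entry: a list of word ids plus a 0/1 sentiment label.
for word_ids, label in imdb.train(word_dict):
    print len(word_ids), label
    break
```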
@@ -246,7 +226,7 @@ if __name__ == '__main__':
         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(
                 reader=paddle.reader.batched(
-                    data_reader(test_file, dict_file), batch_size=128),
+                    lambda: imdb.test(word_dict), batch_size=128),
                 reader_dict={'word': 0,
                              'label': 1})
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
@@ -259,8 +239,8 @@ if __name__ == '__main__':
     trainer.train(
         reader=paddle.reader.batched(
             paddle.reader.shuffle(
-                data_reader(train_file, dict_file), buf_size=4096),
-            batch_size=128),
+                lambda: imdb.train(word_dict), buf_size=1000),
+            batch_size=100),
         event_handler=event_handler,
         reader_dict={'word': 0,
                      'label': 1},
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 433e37380f..db388be1e0 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -118,3 +118,8 @@ def test(word_idx):
     return reader_creator(
         re.compile("aclImdb/test/pos/.*\.txt$"),
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
+
+
+def word_dict():
+    return build_dict(
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
From 8ebfe554bda96e2ff64d64fe9ae6ca461938411a Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Thu, 2 Mar 2017 15:38:48 -0800
Subject: [PATCH 21/22] add batch reader into reader design doc

---
 doc/design/reader/README.md | 91 +++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 25 deletions(-)

diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
index 17d52b9e20..03119fdd74 100644
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
@@ -4,9 +4,10 @@ At training and testing time, PaddlePaddle programs need to read data. To ease t
 
 - A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
 - A *reader creator* is a function that returns a reader function.
-- A *reader* decorator is a function, which accepts one or more readers, and returns a reader.
+- A *reader decorator* is a function which accepts one or more readers and returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
 
-and provide frequently used reader creators and reader decorators.
+and provide a function that converts a reader into a batch reader, as well as frequently used reader creators and reader decorators.
 
 ## Data Reader Interface
 
@@ -37,9 +38,54 @@ def reader_creator_random_imageand_label(widht, height, label):
     return reader
 ```
 
+## Batch Reader Interface
+
+A *batch reader* can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+
+Here are valid outputs:
+```python
+# a mini batch of three data items. Each data item consists of three columns of data.
+[(1, 1, 1),
+(2, 2, 2),
+(3, 3, 3)]
+
+# a mini batch of three data items; each data item is a list (single column).
+[([1,1,1],),
+([2,2,2],),
+([3,3,3],)]
+```
+
+Please note that each item inside the list must be a tuple; below is an invalid output:
+```python
+ # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
+ # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
+[[1,1,1],
+[2,2,2],
+[3,3,3]]
+```
+
+It's easy to convert a reader into a batch reader:
+```python
+mnist_train = paddle.dataset.mnist.train()
+mnist_train_batch_reader = paddle.batch(mnist_train, 128)
+```
+
+It's also easy to create a custom batch reader:
+```python
+def custom_batch_reader():
+    while True:
+        batch = []
+        for i in xrange(128):
+            batch.append((numpy.random.uniform(-1, 1, 28*28),))  # note that it's a tuple being appended.
+        yield batch
+
+mnist_random_image_batch_reader = custom_batch_reader
+```
+
 ## Usage
 
-data reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+A batch reader, a mapping from item(s) read to data layer(s), the batch size, and the total number of passes will be passed into `paddle.train`:
 
 ```python
 # two data layer is created:
@@ -47,8 +93,8 @@
 image_layer = paddle.layer.data("image", ...)
 label_layer = paddle.layer.data("label", ...)
 
 # ...
-
-paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...)
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
+paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
 ```
 
 ## Data Reader Decorator
 
@@ -64,7 +110,7 @@ Since reading data may take time and training can not proceed without data. It i
 Use `paddle.reader.buffered` to prefetch data:
 
 ```python
-buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100)
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
 ```
 
 `buffered_reader` will try to buffer (prefetch) `100` data entries.
 
@@ -91,10 +137,10 @@ def reader_creator_bool(t):
 
 true_reader = reader_creator_bool(True)
 false_reader = reader_creator_bool(False)
 
-reader = paddle.reader.compose(paddle.dataset.mnist, data_reader_creator_random_image(20, 20), true_reader, false_reader)
-# Skipped 1 because paddle.dataset.mnist produces two items per data entry.
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
 # And we don't care second item at this time.
-paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
 ```
 
 ### Shuffle
 
@@ -103,16 +149,20 @@ Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader
 Example:
 ```python
-reader = paddle.reader.shuffle(paddle.dataset.mnist, 512)
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
 ```
 
 ## Q & A
 
-### Why return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, but not a mini batch?
+
+Always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returns 3 entries instead of a single entry, the training code becomes more complex, because it needs to handle cases like a batch size of 2).
+
+We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader.
 
-If a mini batch is returned, data reader need to take care of batch size. But batch size is a concept for training, it makes more sense for user to specify batch size as a parameter for `train`.
+### Why do we need a batch reader? Isn't having `train` take a reader and `batch_size` as arguments sufficient?
-Practically, always return a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+In most cases, having `train` take a reader and `batch_size` as arguments would be sufficient. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically.
 
 ### Why use a dictionary but not a list to provide mapping?
 
@@ -137,7 +187,7 @@ def image_reader_creator(image_path, label_path, n):
 
 # images_reader_creator creates a reader
 reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
-paddle.train(reader, {"image":0, "label":1}, ...)
+paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
 ```
 
 ### How is `paddle.train` implemented
 
 An example implementation of paddle.train could be:
 
 ```python
-def make_minibatch(reader, minibatch_size):
-    def ret():
-        r = reader()
-        buf = [r.next() for x in xrange(minibatch_size)]
-        while len(buf) > 0:
-            yield buf
-            buf = [r.next() for x in xrange(minibatch_size)]
-    return ret
-
-def train(reader, mapping, batch_size, total_pass):
+def train(batch_reader, mapping, batch_size, total_pass):
     for pass_idx in range(total_pass):
-        for mini_batch in make_minibatch(reader): # this loop will never end in online learning.
+        for mini_batch in batch_reader(): # this loop will never end in online learning.
             do_forward_backward(mini_batch, mapping)
 ```
From f9ea339dd03bdf8df068e6936801db82ffd39bcd Mon Sep 17 00:00:00 2001
From: Helin Wang
Date: Thu, 2 Mar 2017 22:14:14 +0000
Subject: [PATCH 22/22] remove osx build from CI

We don't officially support OSX, and Travis CI takes a lot of time to build
on OSX (jobs sit in the pending state for a long time, probably because
Travis CI doesn't have enough OSX machines).
---
 .travis.yml                                 |  9 --------
 paddle/scripts/travis/before_install.osx.sh |  4 ----
 paddle/scripts/travis/build_and_test.sh     | 23 +++++++--------------
 3 files changed, 8 insertions(+), 28 deletions(-)
 delete mode 100755 paddle/scripts/travis/before_install.osx.sh

diff --git a/.travis.yml b/.travis.yml
index 28d1f51be7..5a7f45a748 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,22 +4,14 @@ cache:
   - $HOME/third_party
   - $HOME/.ccache
   - $HOME/.cache/pip
-  - $HOME/Library/Caches/Homebrew
 sudo: required
 dist: trusty
 os:
   - linux
-  - osx
 env:
   - JOB=DOCS
   - JOB=BUILD_AND_TEST
   - JOB=PRE_COMMIT
-matrix:
-  exclude:
-    - os: osx
-      env: JOB=DOCS # Only generate documentation in linux.
-    - os: osx
-      env: JOB=PRE_COMMIT # Only check pre-commit hook in linux
 
 addons:
   apt:
@@ -53,7 +45,6 @@ before_install:
       fi
     fi
   fi
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
 # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
 # protobuf version.
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
deleted file mode 100755
index 80f031a74e..0000000000
--- a/paddle/scripts/travis/before_install.osx.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-brew update
-brew tap homebrew/science
-brew install openblas swig md5sha1sum
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 5e6350b574..7deb3e62e8 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -2,18 +2,11 @@
 source ./common.sh
 
 NPROC=1
-if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
-  export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
-  export PYTHONHOME=/opt/python/2.7.12
-  export PATH=/opt/python/2.7.12/bin:${PATH}
-  cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-  NRPOC=`nproc`
-  make -j $NPROC
-  make coveralls
-  sudo make install
-elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  export PYTHONPATH=/usr/local/lib/python2.7/site-packages
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-  NPROC=`sysctl -n hw.ncpu`
-  make -j $NPROC
-fi
+export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
+export PYTHONHOME=/opt/python/2.7.12
+export PATH=/opt/python/2.7.12/bin:${PATH}
+cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+NPROC=`nproc`
+make -j $NPROC
+make coveralls
+sudo make install
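
A closing illustration for the reader design doc above: its Q&A notes that a batch reader lets the user change the batch size dynamically, but no example of that case is given. Here is a minimal, self-contained sketch under the doc's conventions; every name in it is illustrative, not part of the paddle API:

```python
import random


def varying_batch_reader(reader_creator, min_size=32, max_size=128):
    # Wrap a single-entry reader creator into a batch reader whose batch
    # size is re-drawn for every batch: the "change batch size dynamically"
    # case from the design doc's Q&A.
    def batch_reader():
        batch = []
        size = random.randint(min_size, max_size)
        for item in reader_creator():
            batch.append(item)  # each item is a tuple, per the batch format
            if len(batch) >= size:
                yield batch
                batch = []
                size = random.randint(min_size, max_size)
        if batch:
            yield batch  # final, possibly smaller, batch

    return batch_reader


def toy_reader():
    for i in xrange(1000):
        yield (i,)  # a single-column data item, wrapped in a tuple


batches = varying_batch_reader(toy_reader)()
print [len(b) for b in batches][:5]  # batch sizes differ from batch to batch
```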