From 3f1151a54cf06acb7176b0c59671585b8276e650 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 25 Apr 2017 19:04:22 +0800 Subject: [PATCH 1/4] Add error clipping to MT demo. * Compose GRU step naive layer in trainer config helpers. * It uses mixed_layer for gates. * It supports ERROR_CLIPPING, DROPOUT * Add error clipping in MT demo. * Fix #1143 * Fix #1891 --- demo/seqToseq/seqToseq_net.py | 23 ++++-- demo/seqToseq/translation/train.conf | 1 - python/paddle/trainer_config_helpers/attrs.py | 15 ++-- .../paddle/trainer_config_helpers/layers.py | 75 ++++++++++++++++++- .../paddle/trainer_config_helpers/networks.py | 22 ++++-- 5 files changed, 117 insertions(+), 19 deletions(-) diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index e523a34d5a..3d1f86ec3b 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf, encoder_size=512, decoder_size=512, beam_size=3, - max_length=250): + max_length=250, + error_clipping=50): """ A wrapper for an attention version of GRU Encoder-Decoder network is_generating: whether this config is used for generating @@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf, input=src_word_id, size=word_vector_dim, param_attr=ParamAttr(name='_source_language_embedding')) - src_forward = simple_gru(input=src_embedding, size=encoder_size) + src_forward = simple_gru( + input=src_embedding, + size=encoder_size, + naive=True, + gru_layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) src_backward = simple_gru( - input=src_embedding, size=encoder_size, reverse=True) + input=src_embedding, + size=encoder_size, + reverse=True, + naive=True, + gru_layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) encoded_vector = concat_layer(input=[src_forward, src_backward]) with mixed_layer(size=decoder_size) as encoded_proj: @@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf, decoder_inputs += 
full_matrix_projection(input=context) decoder_inputs += full_matrix_projection(input=current_word) - gru_step = gru_step_layer( + gru_step = gru_step_naive_layer( name='gru_decoder', input=decoder_inputs, output_mem=decoder_mem, - size=decoder_size) + size=decoder_size, + layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) with mixed_layer( size=target_dict_dim, bias_attr=True, diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf index 72b7ccdbb9..0718f00f68 100644 --- a/demo/seqToseq/translation/train.conf +++ b/demo/seqToseq/translation/train.conf @@ -28,7 +28,6 @@ train_conf = seq_to_seq_data(data_dir = data_dir, ### Algorithm Configuration settings( - learning_method = AdamOptimizer(), batch_size = 50, learning_rate = 5e-4) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index bf02088346..7b76e87f04 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -208,12 +208,15 @@ class ExtraLayerAttribute(object): drop_rate=None, device=None): self.attr = dict() - if isinstance(error_clipping_threshold, float): - assert error_clipping_threshold > 0 - self.attr["error_clipping_threshold"] = error_clipping_threshold - - if isinstance(drop_rate, float): - assert drop_rate > 0 + if error_clipping_threshold is not None: + error_clipping_threshold = float(error_clipping_threshold) + if error_clipping_threshold < 0: + raise ValueError("Error clipping must > 0") + self.attr['error_clipping_threshold'] = error_clipping_threshold + if drop_rate is not None: + drop_rate = float(drop_rate) + if drop_rate < 0: + raise ValueError("Dropout rate must > 0") self.attr["drop_rate"] = drop_rate if isinstance(device, int): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index f906126d87..635e280ca5 100755 --- 
a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -84,6 +84,7 @@ __all__ = [ 'GeneratedInput', 'SubsequenceInput', 'gru_step_layer', + 'gru_step_naive_layer', 'recurrent_layer', 'BaseGeneratedInput', 'conv_operator', @@ -2284,7 +2285,7 @@ def img_pool_layer(input, type_name = pool_type.name + '-projection' \ if ( - isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ + isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ else pool_type.name pool_size_y = pool_size if pool_size_y is None else pool_size_y @@ -3084,6 +3085,78 @@ def gru_step_layer(input, activation=act) +@wrap_bias_attr_default() +@wrap_param_attr_default() +@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) +@wrap_act_default(act=TanhActivation()) +@wrap_name_default('gru_step') +@layer_support(ERROR_CLIPPING, DROPOUT) +def gru_step_naive_layer(input, + output_mem, + size=None, + name=None, + act=None, + gate_act=None, + bias_attr=None, + param_attr=None, + layer_attr=None): + """ + GRU Step Layer, but using MixedLayer to generate. It support ERROR_CLIPPING + and DROPOUT. 
+ + :param input: + :param output_mem: + :param size: + :param name: + :param act: + :param gate_act: + :param bias_attr: + :param param_attr: + :param layer_attr: + :return: + """ + if input.size % 3 != 0: + raise ValueError("GruStep input size must be divided by 3") + if size is None: + size = input.size / 3 + + def __gate__(gate_name, offset): + with mixed_layer( + name=name + "_" + gate_name, + size=size, + layer_attr=layer_attr, + bias_attr=bias_attr, + act=gate_act) as gate: + gate += identity_projection(input=input, offset=offset) + gate += full_matrix_projection( + input=output_mem, param_attr=param_attr) + return gate + + update_gate = __gate__("update", 0) + reset_gate = __gate__("reset", size) + + with mixed_layer( + name=name + "_reset_output", bias_attr=False) as reset_output: + reset_output += dotmul_operator(a=output_mem, b=reset_gate) + + with mixed_layer( + name=name + "_output_candidate", + size=size, + layer_attr=layer_attr, + bias_attr=bias_attr, + act=act) as output_candidate: + output_candidate += identity_projection(input=input, offset=2 * size) + output_candidate += full_matrix_projection( + input=reset_output, param_attr=param_attr) + + with mixed_layer(name=name) as output: + output += identity_projection(output_mem) + output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0) + output += dotmul_operator(a=output_candidate, b=update_gate) + + return output + + @wrap_name_default() @layer_support() def get_output_layer(input, arg_name, name=None, layer_attr=None): diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index cadde11ff8..fb533a47e0 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -825,7 +825,8 @@ def gru_unit(input, gru_param_attr=None, act=None, gate_act=None, - gru_layer_attr=None): + gru_layer_attr=None, + naive=False): """ Define calculations that a gated recurrent unit performs in a 
single time step. This function itself is not a recurrent layer, so that it can not be @@ -857,7 +858,12 @@ def gru_unit(input, out_mem = memory(name=name, size=size) - gru_out = gru_step_layer( + if naive: + __step__ = gru_step_naive_layer + else: + __step__ = gru_step_layer + + gru_out = __step__( name=name, input=input, output_mem=out_mem, @@ -879,7 +885,8 @@ def gru_group(input, gru_param_attr=None, act=None, gate_act=None, - gru_layer_attr=None): + gru_layer_attr=None, + naive=False): """ gru_group is a recurrent layer group version of Gated Recurrent Unit. It does exactly the same calculation as the grumemory layer does. A promising @@ -928,7 +935,8 @@ def gru_group(input, gru_param_attr=gru_param_attr, act=act, gate_act=gate_act, - gru_layer_attr=gru_layer_attr) + gru_layer_attr=gru_layer_attr, + naive=naive) return recurrent_group( name='%s_recurrent_group' % name, @@ -949,7 +957,8 @@ def simple_gru(input, gru_param_attr=None, act=None, gate_act=None, - gru_layer_attr=None): + gru_layer_attr=None, + naive=False): """ You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group, simple_gru in network.py. 
The reason why there are so many interfaces is @@ -1018,7 +1027,8 @@ def simple_gru(input, gru_param_attr=gru_param_attr, act=act, gate_act=gate_act, - gru_layer_attr=gru_layer_attr) + gru_layer_attr=gru_layer_attr, + naive=naive) @wrap_name_default('simple_gru2') From c6a8e23a9e68b7121c6bfd94eb6a9b1aea0cf6e3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 26 Apr 2017 10:40:24 +0800 Subject: [PATCH 2/4] Fix unittests --- paddle/gserver/tests/sequence_layer_group.conf | 3 +-- .../tests/configs/protostr/projections.protostr | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf index 68d150d553..50f2d89d02 100644 --- a/paddle/gserver/tests/sequence_layer_group.conf +++ b/paddle/gserver/tests/sequence_layer_group.conf @@ -48,8 +48,7 @@ lstm = lstmemory_group( size=hidden_dim, act=TanhActivation(), gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + state_act=TanhActivation()) lstm_last = last_seq(input=lstm) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index 2afc3afef6..d8bd7b9dfb 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -320,6 +320,7 @@ layers { } } drop_rate: 0.5 + error_clipping_threshold: 40.0 } parameters { name: "___embedding_0__.w0" From 11c8ab80ec806635b6b010a425edca34292456e9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 26 Apr 2017 10:48:59 +0800 Subject: [PATCH 3/4] Revert unchanged files --- demo/seqToseq/translation/train.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf index 0718f00f68..72b7ccdbb9 100644 --- 
a/demo/seqToseq/translation/train.conf +++ b/demo/seqToseq/translation/train.conf @@ -28,6 +28,7 @@ train_conf = seq_to_seq_data(data_dir = data_dir, ### Algorithm Configuration settings( + learning_method = AdamOptimizer(), batch_size = 50, learning_rate = 5e-4) From da2adea964180147bdd26f7f953b6760476d8108 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 26 Apr 2017 11:42:36 +0800 Subject: [PATCH 4/4] Fix unittest --- paddle/gserver/tests/sequence_nest_layer_group.conf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf index 88cb42798b..c01b95f7a2 100644 --- a/paddle/gserver/tests/sequence_nest_layer_group.conf +++ b/paddle/gserver/tests/sequence_nest_layer_group.conf @@ -51,8 +51,7 @@ def lstm_group(lstm_group_input): size=hidden_dim, act=TanhActivation(), gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + state_act=TanhActivation()) return lstm_output