make gru_group parameters sharable

wangyang59 8 years ago
parent ecbff689fb
commit 6da7283475

@@ -68,8 +68,8 @@ bool GruStepLayer::init(const LayerMap& layerMap,
if (!Layer::init(layerMap, parameterMap)) return false;
CHECK_EQ(2U, inputLayers_.size());
- CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
- weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
+ CHECK_EQ(getSize() * getSize() * 3, parameters_[1]->getSize());
+ weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[1]));
if (biasParameter_.get() != NULL) {
CHECK_EQ(getSize() * 3, biasParameter_->getSize());

@@ -2996,7 +2996,7 @@ class GruStepLayer(LayerBase):
config_assert(input_layer1.size == size,
'input_layer1.size != layer.size')
self.config.active_gate_type = active_gate_type
- self.create_input_parameter(0, size * size * 3, [size, size * 3])
+ self.create_input_parameter(1, size * size * 3, [size, size * 3])
self.create_bias_parameter(bias, size * 3)

@@ -19,6 +19,9 @@
# to use these units, import this module in your config_file:
# import trainer.recurrent_units
#
+ # The modules in this file are DEPRECATED.
+ # If you would like to use lstm/gru
+ # please use the functions defined in paddle.trainer_config_helpers.
from paddle.trainer.config_parser import *

@@ -2682,6 +2682,7 @@ def lstm_step_layer(input,
@wrap_bias_attr_default()
+ @wrap_param_attr_default()
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
@wrap_act_default(act=TanhActivation())
@wrap_name_default('gru_step')
@@ -2693,6 +2694,7 @@ def gru_step_layer(input,
name=None,
gate_act=None,
bias_attr=None,
+ param_attr=None,
layer_attr=None):
"""
@@ -2714,7 +2716,7 @@ def gru_step_layer(input,
Layer(
name=name,
type=LayerType.GRU_STEP_LAYER,
- inputs=[input.name, output_mem.name],
+ inputs=[input.name, Input(output_mem.name, **param_attr.attr)],
bias=ParamAttr.to_bias(bias_attr),
size=size,
active_type=act.name,

@@ -822,6 +822,7 @@ def gru_unit(input,
size=None,
name=None,
gru_bias_attr=None,
+ gru_param_attr=None,
act=None,
gate_act=None,
gru_layer_attr=None):
@@ -862,6 +863,7 @@ def gru_unit(input,
output_mem=out_mem,
size=size,
bias_attr=gru_bias_attr,
+ param_attr=gru_param_attr,
act=act,
gate_act=gate_act,
layer_attr=gru_layer_attr)
@@ -874,6 +876,7 @@ def gru_group(input,
name=None,
reverse=False,
gru_bias_attr=None,
+ gru_param_attr=None,
act=None,
gate_act=None,
gru_layer_attr=None):
@@ -922,6 +925,7 @@ def gru_group(input,
name=name,
size=size,
gru_bias_attr=gru_bias_attr,
+ gru_param_attr=gru_param_attr,
act=act,
gate_act=gate_act,
gru_layer_attr=gru_layer_attr)
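
At the gru_group level the same idea reads more directly: handing one named ParamAttr to several groups via the new gru_param_attr should make them train a single recurrent weight matrix. A minimal sketch along the lines of the shared_gru test config added below (names and sizes are illustrative; the inputs are assumed to already be size * 3 wide, since gru_group feeds them straight into gru_step_layer):

from paddle.trainer_config_helpers import *

settings(learning_rate=1e-4, batch_size=1000)

# Two input sequences, each already projected to size * 3 = 600.
seq_a = data_layer(name='seq_a', size=600)
seq_b = data_layer(name='seq_b', size=600)

# Illustrative shared attributes; the shared names are what tie the parameters together.
shared_w = ParamAttr(name='shared_gru_w')
shared_b = ParamAttr(name='shared_gru_b', initial_mean=0., initial_std=0.)

gru_a = gru_group(name='gru_a', input=seq_a, size=200,
                  gru_param_attr=shared_w, gru_bias_attr=shared_b)
gru_b = gru_group(name='gru_b', input=seq_b, size=200,
                  gru_param_attr=shared_w, gru_bias_attr=shared_b)

predict = fc_layer(input=[last_seq(input=gru_a), last_seq(input=gru_b)],
                   size=10, bias_attr=False, act=SoftmaxActivation())
outputs(classification_cost(input=predict,
                            label=data_layer(name='label', size=10)))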
@@ -942,6 +946,7 @@ def simple_gru(input,
mixed_bias_param_attr=None,
mixed_layer_attr=None,
gru_bias_attr=None,
+ gru_param_attr=None,
act=None,
gate_act=None,
gru_layer_attr=None):
@@ -1010,6 +1015,7 @@ def simple_gru(input,
input=m,
reverse=reverse,
gru_bias_attr=gru_bias_attr,
+ gru_param_attr=gru_param_attr,
act=act,
gate_act=gate_act,
gru_layer_attr=gru_layer_attr)

@@ -3,7 +3,7 @@ export configs=(test_fc layer_activations projections test_print_layer
test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
- test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
+ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
export whole_configs=(test_split_datasource)

@@ -307,10 +307,10 @@ layers {
active_type: "tanh"
inputs {
input_layer_name: "__mixed_1__@__gru_group_0___recurrent_group"
input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
}
inputs {
input_layer_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w1"
}
bias_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
active_gate_type: "sigmoid"
@@ -462,14 +462,14 @@ parameters {
initial_smart: false
}
parameters {
name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
name: "___gru_group_0__@__gru_group_0___recurrent_group.w1"
size: 30000
initial_mean: 0.0
- initial_std: 0.01
+ initial_std: 0.1
dims: 100
dims: 300
initial_strategy: 0
- initial_smart: false
+ initial_smart: true
}
parameters {
name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"

@@ -0,0 +1,40 @@
from paddle.trainer_config_helpers import *

settings(learning_rate=1e-4, batch_size=1000)

data_1 = data_layer(name='data_a', size=100)
data_2 = data_layer(name='data_b', size=100)

mixed_param = ParamAttr(name='mixed_param')

gru_param = ParamAttr(name='gru_param')
gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.)

gru1 = simple_gru(
    input=data_1,
    size=200,
    mixed_param_attr=mixed_param,
    mixed_bias_param_attr=False,
    gru_bias_attr=gru_bias,
    gru_param_attr=gru_param)

gru2 = simple_gru(
    input=data_2,
    size=200,
    mixed_param_attr=mixed_param,
    mixed_bias_param_attr=False,
    gru_bias_attr=gru_bias,
    gru_param_attr=gru_param)

softmax_param = ParamAttr(name='softmax_param')

predict = fc_layer(
    input=[last_seq(input=gru1), last_seq(input=gru2)],
    size=10,
    param_attr=[softmax_param, softmax_param],
    bias_attr=False,
    act=SoftmaxActivation())
outputs(
    classification_cost(
        input=predict, label=data_layer(
            name='label', size=10)))