@@ -614,6 +614,7 @@ def simple_lstm(input,
 
 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
+                   memory_boot=None,
                    name=None,
                    size=None,
                    param_attr=None,
@@ -626,9 +627,9 @@ def lstmemory_unit(input,
                    lstm_layer_attr=None,
                    get_output_layer_attr=None):
     """
-    Define calculations that a LSTM unit performs in a single time step.
-    This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is always used in
+    Define calculations that a LSTM unit performs during a single time step.
+    This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
     recurrent_group (see layers.py for more details) to implement attention
     mechanism.
 
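The docstring above positions lstmemory_unit as a step function for recurrent_group. A minimal configuration sketch of that pattern might look as follows; the layer names, sizes, and the projection layer are illustrative assumptions, not part of this change.

    # Sketch only: driving lstmemory_unit from recurrent_group.
    from paddle.trainer_config_helpers import *

    hidden = 256
    emb = data_layer(name='word_emb', size=512)

    # lstmemory_unit expects its input to already carry the four gate
    # projections, so project to 4 * hidden first (see the NOTE further down).
    with mixed_layer(size=hidden * 4) as gate_input:
        gate_input += full_matrix_projection(input=emb)

    def lstm_step(ipt):
        return lstmemory_unit(input=ipt, size=hidden)

    lstm_out = recurrent_group(name='lstm_steps', step=lstm_step, input=gate_input)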
@@ -638,13 +639,13 @@ def lstmemory_unit(input,
 
     .. math::
 
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
 
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
 
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
 
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
 
         h_t & = o_t tanh(c_t)
 
@@ -661,6 +662,8 @@ def lstmemory_unit(input,
 
     :param input: input layer name.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the LSTM cell.
+    :type memory_boot: LayerOutput | None
     :param name: lstmemory unit name.
     :type name: basestring
     :param size: lstmemory unit size.
@@ -692,7 +695,8 @@ def lstmemory_unit(input,
         assert input.size % 4 == 0
         size = input.size / 4
     out_mem = memory(name=name, size=size)
-    state_mem = memory(name="%s_state" % name, size=size)
+    state_mem = memory(
+        name="%s_state" % name, size=size, boot_layer=memory_boot)
 
     with mixed_layer(
             name="%s_input_recurrent" % name,
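The added lines above are where memory_boot takes effect: the cell-state memory is bootstrapped from the given layer instead of starting at zero. For orientation, the same boot_layer mechanism can be wired by hand inside any recurrent_group step; everything in the sketch below (layer names, sizes, the plain-RNN step) is an illustrative assumption, not code from this diff.

    from paddle.trainer_config_helpers import *

    src = data_layer(name='src_emb', size=512)
    trg = data_layer(name='trg_emb', size=128)
    # A single vector per sequence, used to seed the recurrent state.
    encoder_summary = fc_layer(
        input=last_seq(input=src), size=128, act=TanhActivation())

    def decoder_step(ipt):
        # Previous step's output; at the first step it is read from
        # encoder_summary (boot_layer) rather than from a zero vector.
        prev = memory(name='decoder_state', size=128, boot_layer=encoder_summary)
        return fc_layer(
            input=[ipt, prev], size=128, act=TanhActivation(),
            name='decoder_state')

    decoded = recurrent_group(name='rnn_decoder', step=decoder_step, input=trg)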
@@ -726,6 +730,7 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                     size=None,
                     name=None,
+                    memory_boot=None,
                     reverse=False,
                     param_attr=None,
                     act=None,
@@ -737,7 +742,7 @@ def lstmemory_group(input,
                     lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    lstm_group is a recurrent_group version of Long Short Term Memory. It
     does exactly the same calculation as the lstmemory layer (see lstmemory in
     layers.py for the maths) does. A promising benefit is that LSTM memory
     cell states, or hidden states in every time step are accessible to the
@@ -748,8 +753,8 @@ def lstmemory_group(input,
 
     NOTE: In PaddlePaddle's implementation, the following input-to-hidden
     multiplications:
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
+    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
     speed up the calculations. Consequently, an additional mixed_layer with
     full_matrix_projection must be included before lstmemory_unit is called.
 
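Concretely, the note above means the caller builds the 4 * size input-to-hidden projection and hands it to lstmemory_group. A sketch of that wiring, with illustrative names and sizes that are assumptions rather than part of this change:

    from paddle.trainer_config_helpers import *

    hidden = 256
    emb = data_layer(name='word_emb', size=512)

    # Caller-side projection W_x * x for all four gates: size must be 4 * hidden.
    with mixed_layer(size=hidden * 4) as lstm_input:
        lstm_input += full_matrix_projection(input=emb)

    # New with this change: seed the cell state from another layer's output.
    boot = fc_layer(input=last_seq(input=emb), size=hidden, act=TanhActivation())

    # One hidden-state output per time step, usable e.g. by an attention layer.
    states = lstmemory_group(input=lstm_input, size=hidden, memory_boot=boot)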
@@ -765,10 +770,12 @@ def lstmemory_group(input,
 
     :param input: input layer name.
     :type input: LayerOutput
-    :param name: lstmemory group name.
-    :type name: basestring
     :param size: lstmemory group size.
     :type size: int
+    :param name: name of the lstmemory group.
+    :type name: basestring
+    :param memory_boot: the initialization state of LSTM cell.
+    :type memory_boot: LayerOutput | None
     :param reverse: is lstm reversed
     :type reverse: bool
     :param param_attr: Parameter config, None if use default.
@@ -798,6 +805,7 @@ def lstmemory_group(input,
     def __lstm_step__(ipt):
         return lstmemory_unit(
             input=ipt,
+            memory_boot=memory_boot,
             name=name,
             size=size,
             mixed_bias_attr=mixed_bias_attr,
@@ -819,6 +827,7 @@ def lstmemory_group(input,
 
 @wrap_name_default('gru_unit')
 def gru_unit(input,
+             memory_boot=None,
              size=None,
              name=None,
             gru_bias_attr=None,
@@ -829,8 +838,8 @@ def gru_unit(input,
             naive=False):
     """
     Define calculations that a gated recurrent unit performs in a single time
-    step. This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is almost always used in
+    step. This function itself is not a recurrent layer, so it can not be
+    directly used to process sequence inputs. This function is always used in
     the recurrent_group (see layers.py for more details) to implement attention
     mechanism.
 
@@ -838,6 +847,8 @@ def gru_unit(input,
 
     :param input: input layer name.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU cell.
+    :type memory_boot: LayerOutput | None
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
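As with the LSTM unit, the input to gru_unit must already hold the update, reset, and candidate projections, so input.size has to be 3 * size, and memory_boot seeds the hidden state at the first step. A hypothetical sketch (names and sizes are assumptions, not from the diff):

    from paddle.trainer_config_helpers import *

    hidden = 128
    emb = data_layer(name='word_emb', size=256)

    # Caller-side projection to 3 * hidden (update, reset, candidate).
    with mixed_layer(size=hidden * 3) as gru_input:
        gru_input += full_matrix_projection(input=emb)

    boot = fc_layer(input=last_seq(input=emb), size=hidden, act=TanhActivation())

    def gru_step(ipt):
        # memory_boot is forwarded to memory(..., boot_layer=...) inside gru_unit.
        return gru_unit(input=ipt, size=hidden, memory_boot=boot)

    out = recurrent_group(name='gru_steps', step=gru_step, input=gru_input)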
@@ -856,7 +867,7 @@ def gru_unit(input,
     if size is None:
         size = input.size / 3
 
-    out_mem = memory(name=name, size=size)
+    out_mem = memory(name=name, size=size, boot_layer=memory_boot)
 
     if naive:
         __step__ = gru_step_naive_layer
@@ -878,6 +889,7 @@ def gru_unit(input,
 
 @wrap_name_default('gru_group')
 def gru_group(input,
+              memory_boot=None,
              size=None,
              name=None,
              reverse=False,
@@ -888,7 +900,7 @@ def gru_group(input,
              gru_layer_attr=None,
              naive=False):
     """
-    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    gru_group is a recurrent_group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
     benefit is that gru hidden states are accessible to the user. This is
     especially useful in attention model. If you do not need to access
@@ -908,6 +920,8 @@ def gru_group(input,
 
     :param input: input layer name.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU cell.
+    :type memory_boot: LayerOutput | None
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
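gru_group wraps the step function above in its own recurrent_group, so a caller only supplies the 3 * size projected sequence and, with this change, an optional memory_boot. An illustrative sketch under the same assumptions as before:

    from paddle.trainer_config_helpers import *

    hidden = 128
    emb = data_layer(name='word_emb', size=256)

    with mixed_layer(size=hidden * 3) as gru_input:
        gru_input += full_matrix_projection(input=emb)

    boot = fc_layer(input=last_seq(input=emb), size=hidden, act=TanhActivation())

    # One hidden state per time step is exposed to downstream layers, which is
    # what the docstring highlights for attention-style models.
    hidden_states = gru_group(input=gru_input, size=hidden, memory_boot=boot)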
@@ -929,6 +943,7 @@ def gru_group(input,
     def __gru_step__(ipt):
         return gru_unit(
             input=ipt,
+            memory_boot=memory_boot,
             name=name,
             size=size,
             gru_bias_attr=gru_bias_attr,
@@ -1083,7 +1098,6 @@ def simple_gru2(input,
 
     return grumemory(
         name=name,
-        size=size,
         input=m,
         reverse=reverse,
         bias_attr=gru_bias_attr,