@@ -22,7 +22,7 @@ from . import layers
 from ..framework import Variable, OpProtoHolder
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant
 
-__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
+__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit']
 
 class Conv2D(layers.Layer):
@@ -496,3 +496,138 @@ class Embedding(layers.Layer):
             })
 
         return out
+
+
+class GRUUnit(layers.Layer):
+    """
+    **GRU unit layer**
+
+    If ``origin_mode`` is True, the equation of a GRU step is from the paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+    .. math::
+
+        u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+        r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+        m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+        h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    If ``origin_mode`` is False, the equation of a GRU step is from the paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
+
+    .. math::
+
+        u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+        r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+        m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+        h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
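+
+    The two formulations differ only in the interpolation for :math:`h_t`:
+    with ``origin_mode=True`` the update gate :math:`u_t` weights the previous
+    hidden state :math:`h_{t-1}`, while with ``origin_mode=False`` it weights
+    the candidate :math:`m_t`.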
+
+    The inputs of the GRU unit include :math:`z_t` and :math:`h_{t-1}`. In
+    terms of the equations above, :math:`z_t` is split into 3 parts:
+    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that, in order to
+    implement a full GRU unit for an input, a fully connected layer has to be
+    applied first, such that :math:`z_t = W_{fc}x_t`.
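+
+    For example (illustrative numbers only): with hidden size :math:`D = 4`,
+    the fc layer has to produce a :math:`z_t` of width :math:`3D = 12`, which
+    the op then slices into :math:`xu_t`, :math:`xr_t` and :math:`xm_t` of
+    width 4 each.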
+
+    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
+    of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is an
+    intermediate candidate hidden output, which is denoted by :math:`m_t`.
+    This layer has three outputs: :math:`h_t`, :math:`dot(r_t, h_{t-1})` and
+    the concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
+
+    Args:
+        input (Variable): The fc transformed input value of the current step,
+            passed to ``forward``.
+        hidden (Variable): The hidden value of the GRU unit from the previous
+            step, passed to ``forward``.
+        size (int): The input dimension value, i.e. :math:`3D`.
+        param_attr (ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(D \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are the weights of the update gate and reset gate
+              with shape :math:`(D \\times 2D)`, and the second part are the
+              weights for the candidate hidden state with shape
+              :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the
+            param_attr is not set, the parameter is initialized with Xavier.
+            Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate,
+            reset gate and candidate calculations. If it is set to None or one
+            attribute of ParamAttr, gru_unit will create ParamAttr as
+            bias_attr. If the Initializer of the bias_attr is not set, the
+            bias is initialized to zero. Default: None.
+        activation (str): The activation type for the cell (actNode).
+            Default: 'tanh'.
+        gate_activation (str): The activation type for the gates (actGate).
+            Default: 'sigmoid'.
+        origin_mode (bool): Whether to use the formulation from the Learning
+            Phrase Representations paper shown above. Default: False.
+        dtype (str): The data type of this layer's parameters.
+            Default: 'float32'.
+
+    Returns:
+        tuple: The hidden value, reset-hidden value and gate values.
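+
+    Examples:
+
+        A minimal usage sketch. It assumes imperative mode and the
+        ``guard``/``to_variable`` helpers from ``paddle.fluid.imperative``;
+        the sizes are illustrative only:
+
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid as fluid
+
+          D = 32  # hidden size; the fc transformed input has width 3 * D
+          x = np.random.rand(1, 3 * D).astype('float32')
+          pre_hidden = np.random.rand(1, D).astype('float32')
+
+          with fluid.imperative.guard():
+              gru = GRUUnit(size=3 * D)
+              hidden, reset_hidden_pre, gate = gru(
+                  fluid.imperative.to_variable(x),
+                  fluid.imperative.to_variable(pre_hidden))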
+    """
+
+    def __init__(self,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 activation='tanh',
+                 gate_activation='sigmoid',
+                 origin_mode=False,
+                 dtype='float32'):
+        super(GRUUnit, self).__init__()
+
+        # map the activation names to the integer codes the gru_unit op takes
+        activation_dict = dict(
+            identity=0,
+            sigmoid=1,
+            tanh=2,
+            relu=3, )
+        self._activation = activation_dict[activation]
+        self._gate_activation = activation_dict[gate_activation]
+        self._origin_mode = origin_mode
+        self._dtype = dtype
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            'gru_unit',
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            dtype=dtype)
+
+        # `size` is the width of the fc transformed input, i.e. 3 * D
+        size = size // 3
+        # create the hidden-hidden weight matrix of shape (D, 3 * D)
+        self._weight = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=[size, 3 * size],
+            dtype=dtype)
+
+        # create the (1, 3 * D) bias unless bias_attr is explicitly False
+        if self._helper.bias_attr:
+            bias_size = [1, 3 * size]
+            self._bias = self._helper.create_parameter(
+                attr=self._helper.bias_attr,
+                shape=bias_size,
+                dtype=dtype,
+                is_bias=True)
+        else:
+            self._bias = None
+
+    def forward(self, input, hidden):
+        inputs = {
+            'Input': input,
+            'HiddenPrev': hidden,
+            'Weight': self._weight
+        }
+        if self._bias is not None:
+            inputs['Bias'] = self._bias
+
+        gate = self._helper.create_variable_for_type_inference(self._dtype)
+        reset_hidden_pre = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        updated_hidden = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type='gru_unit',
+            inputs=inputs,
+            outputs={
+                'Gate': gate,
+                'ResetHiddenPrev': reset_hidden_pre,
+                'Hidden': updated_hidden,
+            },
+            attrs={
+                'activation': self._activation,
+                'gate_activation': self._gate_activation,
+                'origin_mode': self._origin_mode,
+            })
+
+        return updated_hidden, reset_hidden_pre, gate