@@ -22,7 +22,7 @@ from . import layers
 from ..framework import Variable, OpProtoHolder
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant
 
-__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
+__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit']
 
 class Conv2D(layers.Layer):
@@ -496,3 +496,138 @@ class Embedding(layers.Layer):
             })
 
         return out
+
+
+class GRUUnit(layers.Layer):
+    """
+    **GRU unit layer**
+
+    If ``origin_mode`` is True, the equation of a GRU step is from the paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+    .. math::
+
+        u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+        r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+        m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+        h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    If ``origin_mode`` is False, the equation of a GRU step is from the paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
+
+    .. math::
+
+        u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+        r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+        m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+        h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
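+
+    The two formulations differ only in the interpolation for :math:`h_t`:
+    with ``origin_mode=True`` the update gate :math:`u_t` weights the previous
+    hidden state :math:`h_{t-1}`, while with ``origin_mode=False`` it weights
+    the candidate :math:`m_t`.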
+
+    The inputs of the GRU unit include :math:`z_t` and :math:`h_{t-1}`. In
+    terms of the equations above, :math:`z_t` is split into 3 parts:
+    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that, in order to
+    implement a full GRU unit for an input, a fully connected layer has to be
+    applied first, such that :math:`z_t = W_{fc}x_t`.
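+
+    For example (illustrative numbers only): with hidden size :math:`D = 4`,
+    the fc layer has to produce a :math:`z_t` of width :math:`3D = 12`, which
+    the op then slices into :math:`xu_t`, :math:`xr_t` and :math:`xm_t` of
+    width 4 each.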
+
+    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
+    of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is an
+    intermediate candidate hidden output, which is denoted by :math:`m_t`.
+    This layer has three outputs: :math:`h_t`, :math:`dot(r_t, h_{t-1})` and
+    the concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
+
+    Args:
+        input (Variable): The fc transformed input value of the current step,
+            passed to ``forward``.
+        hidden (Variable): The hidden value of the GRU unit from the previous
+            step, passed to ``forward``.
+        size (int): The input dimension value, i.e. :math:`3D`.
+        param_attr (ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(D \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are the weights of the update gate and reset gate
+              with shape :math:`(D \\times 2D)`, and the second part are the
+              weights for the candidate hidden state with shape
+              :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the
+            param_attr is not set, the parameter is initialized with Xavier.
+            Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate,
+            reset gate and candidate calculations. If it is set to None or one
+            attribute of ParamAttr, gru_unit will create ParamAttr as
+            bias_attr. If the Initializer of the bias_attr is not set, the
+            bias is initialized to zero. Default: None.
+        activation (str): The activation type for the cell (actNode).
+            Default: 'tanh'.
+        gate_activation (str): The activation type for the gates (actGate).
+            Default: 'sigmoid'.
+        origin_mode (bool): Whether to use the formulation from the Learning
+            Phrase Representations paper shown above. Default: False.
+        dtype (str): The data type of this layer's parameters.
+            Default: 'float32'.
+
+    Returns:
+        tuple: The hidden value, reset-hidden value and gate values.
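+
+    Examples:
+
+        A minimal usage sketch. It assumes imperative mode and the
+        ``guard``/``to_variable`` helpers from ``paddle.fluid.imperative``;
+        the sizes are illustrative only:
+
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid as fluid
+
+          D = 32  # hidden size; the fc transformed input has width 3 * D
+          x = np.random.rand(1, 3 * D).astype('float32')
+          pre_hidden = np.random.rand(1, D).astype('float32')
+
+          with fluid.imperative.guard():
+              gru = GRUUnit(size=3 * D)
+              hidden, reset_hidden_pre, gate = gru(
+                  fluid.imperative.to_variable(x),
+                  fluid.imperative.to_variable(pre_hidden))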
+    """
+
+    def __init__(self,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 activation='tanh',
+                 gate_activation='sigmoid',
+                 origin_mode=False,
+                 dtype='float32'):
+        super(GRUUnit, self).__init__()
+
+        # map the activation names to the integer codes the gru_unit op takes
+        activation_dict = dict(
+            identity=0,
+            sigmoid=1,
+            tanh=2,
+            relu=3, )
+        self._activation = activation_dict[activation]
+        self._gate_activation = activation_dict[gate_activation]
+        self._origin_mode = origin_mode
+        self._dtype = dtype
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            'gru_unit',
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            dtype=dtype)
+
+        # `size` is the width of the fc transformed input, i.e. 3 * D
+        size = size // 3
+        # create the hidden-hidden weight matrix of shape (D, 3 * D)
+        self._weight = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=[size, 3 * size],
+            dtype=dtype)
+
+        # create the (1, 3 * D) bias unless bias_attr is explicitly False
+        if self._helper.bias_attr:
+            bias_size = [1, 3 * size]
+            self._bias = self._helper.create_parameter(
+                attr=self._helper.bias_attr,
+                shape=bias_size,
+                dtype=dtype,
+                is_bias=True)
+        else:
+            self._bias = None
+
+    def forward(self, input, hidden):
+        inputs = {
+            'Input': input,
+            'HiddenPrev': hidden,
+            'Weight': self._weight
+        }
+        if self._bias is not None:
+            inputs['Bias'] = self._bias
+
+        gate = self._helper.create_variable_for_type_inference(self._dtype)
+        reset_hidden_pre = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        updated_hidden = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type='gru_unit',
+            inputs=inputs,
+            outputs={
+                'Gate': gate,
+                'ResetHiddenPrev': reset_hidden_pre,
+                'Hidden': updated_hidden,
+            },
+            attrs={
+                'activation': self._activation,
+                'gate_activation': self._gate_activation,
+                'origin_mode': self._origin_mode,
+            })
+
+        return updated_hidden, reset_hidden_pre, gate