@@ -26,6 +26,7 @@ __all__ = [
    'fc',
    'embedding',
    'dynamic_lstm',
    'dynamic_lstmp',
    'dynamic_gru',
    'gru_unit',
    'linear_chain_crf',
@@ -282,7 +283,7 @@ def dynamic_lstm(input,
    W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
    our implementation, we use vectors to reprenset these diagonal weight
    matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
    gate bias vector), :math:`\sigma` is the non-line activations, such as
    gate bias vector), :math:`\sigma` is the non-linear activations, such as
    logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
    gate, forget gate, output gate, and cell activation vectors, respectively,
    all of which have the same size as the cell output activation vector :math:`h`.
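
The diagonal peephole matrices mentioned above reduce to element-wise products, which is why they can be stored as plain vectors. A minimal NumPy sketch of a single peephole-augmented input gate (illustrative only; the names `x_gate_i`, `w_ic`, etc. are not part of this operator's interface):

.. code-block:: python

    import numpy as np

    def peephole_input_gate(x_gate_i, h_prev, c_prev, W_hi, w_ic, b_i):
        # Multiplying c_prev by diag(w_ic) is the same as the element-wise
        # product w_ic * c_prev, so the peephole weight is kept as a vector.
        z = x_gate_i + h_prev.dot(W_hi) + w_ic * c_prev + b_i
        return 1.0 / (1.0 + np.exp(-z))  # logistic sigmoid gate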

@@ -389,6 +390,181 @@ def dynamic_lstm(input,
    return hidden, cell


def dynamic_lstmp(input,
                  size,
                  proj_size,
                  param_attr=None,
                  bias_attr=None,
                  use_peepholes=True,
                  is_reverse=False,
                  gate_activation='sigmoid',
                  cell_activation='tanh',
                  candidate_activation='tanh',
                  proj_activation='tanh',
                  dtype='float32'):
    """
    **Dynamic LSTMP Layer**

    LSTMP (LSTM with recurrent projection) layer has a separate projection
    layer after the LSTM layer, projecting the original hidden state to a
    lower-dimensional one, which is proposed to reduce the total number of
    parameters and, furthermore, the computational complexity of the LSTM,
    especially for the case where the size of the output units is relatively
    large (https://research.google.com/pubs/archive/43905.pdf).
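
    For instance, with a hidden size of :math:`D = 512` and a projection size
    of :math:`P = 256` (the sizes used in the example below), the recurrent
    weights shrink from :math:`4D \times D = 1048576` parameters in a plain
    LSTM to :math:`P \times 4D + D \times P = 524288 + 131072 = 655360`
    parameters here.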

    The formula is as follows:

    .. math::

        i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
        f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
        \tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\
        o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\
        c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
        h_t = o_t \odot act_h(c_t) \\
        r_t = \overline{act_h}(W_{rh}h_t)

    where the :math:`W` terms denote weight matrices (e.g. :math:`W_{ix}` is
    the matrix of weights from the input to the input gate), :math:`W_{ic}`,
    :math:`W_{fc}`, :math:`W_{oc}` are diagonal weight matrices for peephole
    connections. In our implementation, we use vectors to represent these
    diagonal weight matrices. The :math:`b` terms denote bias vectors
    (:math:`b_i` is the input gate bias vector), :math:`\sigma` is the
    non-linear activation, such as the logistic sigmoid function, and
    :math:`i, f, o` and :math:`c` are the input gate, forget gate, output gate,
    and cell activation vectors, respectively, all of which have the same size
    as the cell output activation vector :math:`h`. Here :math:`h` is usually
    called the hidden state and :math:`r` denotes its recurrent projection,
    and :math:`\tilde{c_t}` is also called the candidate hidden state, whose
    computation is based on the current input and the previous hidden state.

    :math:`\odot` is the element-wise product of vectors. :math:`act_g` and
    :math:`act_h` are the cell input and cell output activation functions,
    for which `tanh` is usually used. :math:`\overline{act_h}` is the
    activation function for the projection output, usually `identity` or the
    same as :math:`act_h`.

    Set `use_peepholes` to `False` to disable the peephole connections. The
    corresponding formula is omitted here; please refer to the paper
    http://www.bioinf.jku.at/publications/older/2604.pdf for details.

    Note that the :math:`W_{ix}x_{t}, W_{fx}x_{t}, W_{cx}x_{t}, W_{ox}x_{t}`
    operations on the input :math:`x_{t}` are NOT included in this operator.
    Users can choose to apply a fully-connected layer before the LSTMP layer.
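
    Putting the formulas together, one recurrence step can be sketched as
    follows (a minimal NumPy sketch; it assumes the gate projections of the
    input have already been computed by a preceding fully-connected layer, and
    the helper name `lstmp_step`, its arguments and the gate ordering are
    illustrative only, not part of this operator's interface):

    .. code-block:: python

        import numpy as np

        def sigmoid(x):
            return 1.0 / (1.0 + np.exp(-x))

        def lstmp_step(x_gates, r_prev, c_prev, W_r, w_ic, w_fc, w_oc, b, W_rh):
            # x_gates: (4 * D,) precomputed input projections for the 4 gates
            # r_prev:  (P,) previous projected hidden state
            # c_prev:  (D,) previous cell state
            # W_r: (P, 4 * D) hidden-hidden weight, W_rh: (D, P) projection weight
            gates = x_gates + r_prev.dot(W_r) + b
            zi, zf, zc, zo = np.split(gates, 4)
            i = sigmoid(zi + w_ic * c_prev)      # input gate with peephole
            f = sigmoid(zf + w_fc * c_prev)      # forget gate with peephole
            c_tilde = np.tanh(zc)                # candidate cell state, act_g
            c = f * c_prev + i * c_tilde         # new cell state
            o = sigmoid(zo + w_oc * c)           # output gate peeks at new cell
            h = o * np.tanh(c)                   # hidden state, act_h
            r = np.tanh(h.dot(W_rh))             # recurrent projection
            return r, c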

    Args:
        input(Variable): The input of dynamic_lstmp layer, which supports
                         variable-time length input sequence. The underlying
                         tensor in this Variable is a matrix with shape
                         (T x 4D), where T is the total time steps in this
                         mini-batch and D is the hidden size.
        size(int): 4 * hidden size.
        proj_size(int): The size of projection output.
        param_attr(ParamAttr): The parameter attribute for the learnable
                               hidden-hidden weight and projection weight
                               (see the shape sketch after this argument list).

                               - The shape of hidden-hidden weight is (P x 4D),
                                 where P is the projection size and D the
                                 hidden size.
                               - The shape of projection weight is (D x P).
                               - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
                                 W_{fh}, W_{oh}`}.
                               - Projection weight = {:math:`W_{rh}`}.
        bias_attr(ParamAttr): The bias attribute for the learnable bias
                              weights, which contains two parts: input-hidden
                              bias weights, and peephole connection weights if
                              `use_peepholes` is set to `True`.

                              1. `use_peepholes = False`
                                 - The shape is (1 x 4D).
                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
                              2. `use_peepholes = True`
                                 - The shape is (1 x 7D).
                                 - Biases = {:math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                   W_{fc}, W_{oc}`}.
        use_peepholes(bool): Whether to enable diagonal/peephole connections,
                             default `True`.
        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
        gate_activation(str): The activation for input gate, forget gate and
                              output gate. Choices = ["sigmoid", "tanh", "relu",
                              "identity"], default "sigmoid".
        cell_activation(str): The activation for cell output. Choices =
                              ["sigmoid", "tanh", "relu", "identity"],
                              default "tanh".
        candidate_activation(str): The activation for candidate hidden state.
                                   Choices = ["sigmoid", "tanh", "relu",
                                   "identity"], default "tanh".
        proj_activation(str): The activation for projection output.
                              Choices = ["sigmoid", "tanh", "relu",
                              "identity"], default "tanh".
        dtype(str): Data type. Choices = ["float32", "float64"],
                    default "float32".
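
    As a quick sanity check on the parameter shapes listed above, the expected
    shapes can be written out as follows (an illustrative helper, not part of
    the API; `hidden_dim` and `proj_dim` correspond to D and P):

    .. code-block:: python

        def lstmp_param_shapes(hidden_dim, proj_dim, use_peepholes=True):
            D, P = hidden_dim, proj_dim
            return {
                'hidden_hidden_weight': (P, 4 * D),  # W_ch, W_ih, W_fh, W_oh
                'projection_weight': (D, P),         # W_rh
                'bias': (1, 7 * D if use_peepholes else 4 * D),
            }

        # lstmp_param_shapes(512, 256) ->
        # {'hidden_hidden_weight': (256, 2048),
        #  'projection_weight': (512, 256),
        #  'bias': (1, 3584)}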

    Returns:
        tuple: The projection of the hidden state and the cell state of the
               LSTMP layer. The shape of the projection is (T x P) and the
               shape of the cell state is (T x D); both share the same LoD
               as `input`.

    Examples:
        .. code-block:: python

            hidden_dim = 512
            proj_dim = 256
            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
                                     act=None, bias_attr=None)
            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
                                                     size=hidden_dim * 4,
                                                     proj_size=proj_dim,
                                                     use_peepholes=False)
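
            # A possible follow-up (illustrative only; the output size 10 and
            # the classification use case are assumptions): feed the projected
            # hidden states into another fully-connected layer.
            prediction = fluid.layers.fc(input=proj_out, size=10, act='softmax')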

    """
    helper = LayerHelper('lstmp', **locals())
    # `size` is 4 * D (the concatenated size of the four gates);
    # recover the hidden size D.
    size = size / 4
    # Hidden-hidden weight has shape (P x 4D), projection weight (D x P).
    weight = helper.create_parameter(
        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
    proj_weight = helper.create_parameter(
        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
    # The bias holds the four gate biases, plus the three peephole weight
    # vectors when peephole connections are enabled.
    bias_size = [1, 7 * size]
    if not use_peepholes:
        bias_size[1] = 4 * size
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)

    projection = helper.create_tmp_variable(dtype)
    cell = helper.create_tmp_variable(dtype)
    ordered_proj0 = helper.create_tmp_variable(dtype)
    batch_hidden = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
    batch_cell_pre_act = helper.create_tmp_variable(dtype)

    helper.append_op(
        type='lstmp',
        inputs={
            'Input': input,
            'Weight': weight,
            'ProjWeight': proj_weight,
            'Bias': bias
        },
        outputs={
            'Projection': projection,
            'Cell': cell,
            'OrderedP0': ordered_proj0,
            'BatchHidden': batch_hidden,
            'BatchGate': batch_gate,
            'BatchCellPreAct': batch_cell_pre_act
        },
        attrs={
            'use_peepholes': use_peepholes,
            'is_reverse': is_reverse,
            'gate_activation': gate_activation,
            'cell_activation': cell_activation,
            'candidate_activation': candidate_activation,
            'proj_activation': proj_activation
        })
    return projection, cell


def dynamic_gru(input,
                size,
                param_attr=None,