# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import collections
import itertools
import six
import math
import sys
import warnings
from functools import partial, reduce

import paddle
from paddle import framework
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddle.fluid.dygraph import Layer, LayerList
from paddle.fluid.layers import utils
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.data_feeder import convert_dtype

__all__ = [
    'RNNCellBase',
    'SimpleRNNCell',
    'LSTMCell',
    'GRUCell',
    'RNN',
    'BiRNN',
    'SimpleRNN',
    'LSTM',
    'GRU',
]


def split_states(states, bidirectional=False, state_components=1):
    r"""
    Split states of an RNN network into a possibly nested list or tuple of
    states, one for each RNN cell of the network.

    Arguments:
        states (Tensor|tuple|list): the concatenated states for the RNN network.
            When `state_components` is 1, `states` is a Tensor with shape
            `(L*D, N, C)`, where `L` is the number of layers of the RNN
            network, `D` is the number of directions of the RNN network (1
            for unidirectional RNNs and 2 for bidirectional RNNs), `N` is
            the batch size of the input to the RNN network, and `C` is the
            hidden size of the RNN network.

            When `state_components` is larger than 1, `states` is a tuple of
            `state_components` Tensors that meet the requirements described
            above.

            For SimpleRNNs and GRUs, `state_components` is 1, and for LSTMs,
            `state_components` is 2.
        bidirectional (bool): whether the state is of a bidirectional RNN
            network. Defaults to False.
        state_components (int): the number of the components of the states. See
            `states` above. Defaults to 1.

    Returns:
        A nested list or tuple of RNN cell states.
        If `bidirectional` is True, it can be indexed twice to get an RNN
        cell state. The first index indicates the layer, the second index
        indicates the direction.
        If `bidirectional` is False, it can be indexed once to get an RNN
        cell state. The index indicates the layer.
        Note that if `state_components` is larger than 1, an RNN cell state
        can be indexed one more time to get a tensor of shape (N, C), where
        `N` is the batch size of the input to the RNN cell, and `C` is the
        hidden size of the RNN cell.
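
    Examples:

        .. code-block:: python

            # A minimal sketch with illustrative shapes. `split_states` is a
            # module-level helper in this file (not exported in `__all__`);
            # here it splits the state of a hypothetical 2-layer bidirectional
            # LSTM-style network (state_components=2) into per-layer,
            # per-direction cell states.
            import paddle
            paddle.disable_static()

            h = paddle.randn([4, 8, 32])  # (num_layers * num_directions, batch, hidden)
            c = paddle.randn([4, 8, 32])
            states = split_states((h, c), bidirectional=True, state_components=2)
            # states[layer][direction] is an (h, c) pair, each of shape [8, 32]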
    """
    if state_components == 1:
        states = paddle.unstack(states)
        if not bidirectional:
            return states
        else:
            return list(zip(states[::2], states[1::2]))
    else:
        assert len(states) == state_components
        states = tuple([paddle.unstack(item) for item in states])
        if not bidirectional:
            return list(zip(*states))
        else:
            states = list(zip(*states))
            return list(zip(states[::2], states[1::2]))


def concat_states(states, bidirectional=False, state_components=1):
    r"""
    Concatenate a possibly nested list or tuple of RNN cell states into a
    compact form.

    Arguments:
        states (list|tuple): a possibly nested list or tuple of RNN cell
            states.
            If `bidirectional` is True, it can be indexed twice to get an
            RNN cell state. The first index indicates the layer, the second
            index indicates the direction.
            If `bidirectional` is False, it can be indexed once to get an RNN
            cell state. The index indicates the layer.
            Note that if `state_components` is larger than 1, an RNN cell
            state can be indexed one more time to get a tensor of shape (N, C),
            where `N` is the batch size of the input to the RNN cell, and
            `C` is the hidden size of the RNN cell.
        bidirectional (bool): whether the state is of a bidirectional RNN
            network. Defaults to False.
        state_components (int): the number of the components of the states. See
            `states` above. Defaults to 1.

    Returns:
        The concatenated states for the RNN network.
        When `state_components` is 1, the states are concatenated into a Tensor
        with shape `(L*D, N, C)`, where `L` is the number of layers of the RNN
        network, `D` is the number of directions of the RNN network (1 for
        unidirectional RNNs and 2 for bidirectional RNNs), `N` is the batch
        size of the input to the RNN network, and `C` is the hidden size of the
        RNN network.
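
    Examples:

        .. code-block:: python

            # A minimal sketch with illustrative shapes. `concat_states` is a
            # module-level helper in this file (not exported in `__all__`);
            # here it reassembles per-layer, per-direction (h, c) pairs of a
            # hypothetical 2-layer bidirectional LSTM-style network
            # (state_components=2), i.e. the inverse of `split_states`.
            import paddle
            paddle.disable_static()

            def pair():
                return (paddle.randn([8, 32]), paddle.randn([8, 32]))

            states = [(pair(), pair()), (pair(), pair())]  # [layer][direction] -> (h, c)
            h, c = concat_states(states, bidirectional=True, state_components=2)
            # h and c each have shape [4, 8, 32]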
    """
    if state_components == 1:
        return paddle.stack(flatten(states))
    else:
        states = flatten(states)
        components = []
        for i in range(state_components):
            components.append(states[i::state_components])
        return [paddle.stack(item) for item in components]


class RNNCellBase(Layer):
    r"""
    RNNCellBase is the base class for abstractions representing the
    calculations that map the input and state to the output and new state.
    It is suitable for, and mostly used in, RNNs.
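
    Examples:

        .. code-block:: python

            # A minimal sketch of a custom cell; the `AccumulateCell` name and
            # its update rule are purely illustrative. A subclass implements
            # `forward`, and also `state_shape` (and optionally `state_dtype`)
            # if it relies on `get_initial_states`.
            import paddle
            paddle.disable_static()

            class AccumulateCell(paddle.nn.RNNCellBase):
                def __init__(self, hidden_size):
                    super(AccumulateCell, self).__init__()
                    self.hidden_size = hidden_size

                def forward(self, inputs, states=None):
                    if states is None:
                        states = self.get_initial_states(inputs, self.state_shape)
                    new_states = inputs + states  # accumulate inputs over time
                    return new_states, new_states

                @property
                def state_shape(self):
                    return (self.hidden_size, )

            cell = AccumulateCell(16)
            x = paddle.randn((4, 16))
            y, new_state = cell(x)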
    """

    def get_initial_states(self,
                           batch_ref,
                           shape=None,
                           dtype=None,
                           init_value=0.,
                           batch_dim_idx=0):
        r"""
        Generate initialized states according to the provided shape, data type
        and value.
        Arguments:
            batch_ref (Tensor): A tensor whose shape is used to determine the
                batch size, which is used to generate initial states. For
                `batch_ref`'s shape d, `d[batch_dim_idx]` is treated as the
                batch size.
            shape (list|tuple, optional): A (possibly nested structure of) shape[s],
                where a shape is a list/tuple of integers. `-1` (for batch size)
                will be automatically prepended if a shape does not start with
                it. If None, property `state_shape` will be used. Defaults to
                None.
            dtype (str|list|tuple, optional): A (possibly nested structure of)
                data type[s]. The structure must be the same as that of `shape`,
                except when all tensors in the states have the same data type,
                in which case a single data type can be used. If None and
                property `cell.state_dtype` is not available, the current
                default floating type of paddle is used. Defaults to None.
            init_value (float, optional): A float value used to initialize states.
                Defaults to 0.
            batch_dim_idx (int, optional): An integer indicating which
                dimension of `batch_ref` represents the batch. Defaults to 0.
        Returns:
            init_states (Tensor|tuple|list): tensor of the provided shape and
                dtype, or list of tensors that each satisfies the requirements,
                packed in the same structure as `shape` and `dtype` do.
        """
        # TODO: use inputs and batch_size
        batch_ref = flatten(batch_ref)[0]

        def _is_shape_sequence(seq):
            if sys.version_info < (3, ):
                integer_types = (
                    int,
                    long, )
            else:
                integer_types = (int, )
            """For shape, a list/tuple of integers is the finest-grained object"""
            if (isinstance(seq, list) or isinstance(seq, tuple)):
                if reduce(lambda flag, x: isinstance(x, integer_types) and flag,
                          seq, True):
                    return False
            # TODO: Add check for the illegal
            if isinstance(seq, dict):
                return True
            return (isinstance(seq, collections.Sequence) and
                    not isinstance(seq, six.string_types))

        class Shape(object):
            def __init__(self, shape):
                self.shape = shape if shape[0] == -1 else ([-1] + list(shape))

        # nested structure of shapes
        states_shapes = self.state_shape if shape is None else shape
        is_sequence_ori = utils.is_sequence
        utils.is_sequence = _is_shape_sequence
        states_shapes = map_structure(lambda shape: Shape(shape), states_shapes)
        utils.is_sequence = is_sequence_ori

        # nested structure of dtypes
        try:
            states_dtypes = self.state_dtype if dtype is None else dtype
        except NotImplementedError:
            states_dtypes = framework.get_default_dtype()
        if len(flatten(states_dtypes)) == 1:
            dtype = flatten(states_dtypes)[0]
            states_dtypes = map_structure(lambda shape: dtype, states_shapes)

        init_states = map_structure(
            lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like(
                input=batch_ref,
                shape=shape.shape,
                dtype=dtype,
                value=init_value,
                input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
        return init_states

    @property
    def state_shape(self):
        r"""
        Abstract method (property).
        Used to initialize states.
        A (possibly nested structure of) shape[s], where a shape is a
        list/tuple of integers (-1 for batch size would be automatically
        inserted into a shape if the shape does not start with it).
        Not necessary to be implemented if states are not initialized by
        `get_initial_states` or if the `shape` argument is provided when using
        `get_initial_states`.
        """
        raise NotImplementedError(
            "Please add implementation for `state_shape` in the used cell.")

    @property
    def state_dtype(self):
        r"""
        Abstract method (property).
        Used to initialize states.
        A (possibly nested structure of) data type[s]. The structure must be
        the same as that of `shape`, except when all tensors in the states have
        the same data type, in which case a single data type can be used.
        Not necessary to be implemented if states are not initialized
        by `get_initial_states` or if the `dtype` argument is provided when
        using `get_initial_states`.
        """
        raise NotImplementedError(
            "Please add implementation for `state_dtype` in the used cell.")


class SimpleRNNCell(RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
    computes the outputs and updates the states.

    The formula used is as follows:

    .. math::
        h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}

    where :math:`\mathrm{tanh}` is the hyperbolic tangent function.

    Please refer to `Finding Structure in Time
    <https://crl.ucsd.edu/~elman/Papers/fsit.pdf>`_ for more details.

    Arguments:
        input_size (int): The input size.
        hidden_size (int): The hidden size.
        activation (str, optional): The activation in the SimpleRNN cell.
            It can be `tanh` or `relu`. Defaults to `tanh`.
        weight_ih_attr (ParamAttr, optional): The parameter attribute for
            `weight_ih`. Default: None.
        weight_hh_attr (ParamAttr, optional): The parameter attribute for
            `weight_hh`. Default: None.
        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
            `bias_ih`. Default: None.
        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
            `bias_hh`. Default: None.
        name (str, optional): Name for the operation (optional, default is
            None). For more information, please refer to :ref:`api_guide_Name`.

    Parameters:
        weight_ih (Parameter): shape (hidden_size, input_size), input to hidden
            weight, corresponding to :math:`W_{ih}` in the formula.
        weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to
            hidden weight, corresponding to :math:`W_{hh}` in the formula.
        bias_ih (Parameter): shape (hidden_size, ), input to hidden bias,
            corresponding to :math:`b_{ih}` in the formula.
        bias_hh (Parameter): shape (hidden_size, ), hidden to hidden bias,
            corresponding to :math:`b_{hh}` in the formula.

    Inputs:
        inputs (Tensor): shape `[batch_size, input_size]`, the input,
            corresponding to :math:`x_t` in the formula.
        states (Tensor, optional): shape `[batch_size, hidden_size]`, the
            previous hidden state, corresponding to :math:`h_{t-1}` in the
            formula. When states is None, a zero state is used. Defaults to
            None.

    Returns:
        (outputs, new_states)
        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
            corresponding to :math:`h_{t}` in the formula.
        new_states (Tensor): shape `[batch_size, hidden_size]`, the new hidden
            state, corresponding to :math:`h_{t}` in the formula.

    Notes:
        All the weights and biases are initialized with `Uniform(-std, std)` by
        default, where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
        information about parameter initialization, please refer to
        :ref:`api_fluid_ParamAttr`.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            x = paddle.randn((4, 16))
            prev_h = paddle.randn((4, 32))

            cell = paddle.nn.SimpleRNNCell(16, 32)
            y, h = cell(x, prev_h)

    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super(SimpleRNNCell, self).__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_ih = self.create_parameter(
            (hidden_size, input_size),
            weight_ih_attr,
            default_initializer=I.Uniform(-std, std))
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = self.create_parameter(
            (hidden_size, ),
            bias_ih_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.input_size = input_size
        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh or relu, "
                "but got {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        return (self.hidden_size, )


class LSTMCell(RNNCellBase):
    r"""
    Long-Short Term Memory (LSTM) RNN cell. Given the inputs and previous
    states, it computes the outputs and updates the states.

    The formula used is as follows:

    .. math::
        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})
        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})
        \widetilde{c}_{t} & = \tanh(W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
        c_{t} & = f_{t} \* c_{t-1} + i_{t} \* \widetilde{c}_{t}
        h_{t} & = o_{t} \* \tanh(c_{t})
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
    multiplication operator.

    Please refer to `An Empirical Exploration of Recurrent Network Architectures
    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.

    Arguments:
        input_size (int): The input size.
        hidden_size (int): The hidden size.
        weight_ih_attr (ParamAttr, optional): The parameter attribute for
            `weight_ih`. Default: None.
        weight_hh_attr (ParamAttr, optional): The parameter attribute for
            `weight_hh`. Default: None.
        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
            `bias_ih`. Default: None.
        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
            `bias_hh`. Default: None.
        name (str, optional): Name for the operation (optional, default is
            None). For more information, please refer to :ref:`api_guide_Name`.

    Parameters:
        weight_ih (Parameter): shape (4 * hidden_size, input_size), input to
            hidden weight, which corresponds to the concatenation of
            :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
        weight_hh (Parameter): shape (4 * hidden_size, hidden_size), hidden to
            hidden weight, which corresponds to the concatenation of
            :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula.
        bias_ih (Parameter): shape (4 * hidden_size, ), input to hidden bias,
            which corresponds to the concatenation of
            :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
        bias_hh (Parameter): shape (4 * hidden_size, ), hidden to hidden bias,
            which corresponds to the concatenation of
            :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.

    Inputs:
        inputs (Tensor): shape `[batch_size, input_size]`, the input,
            corresponding to :math:`x_t` in the formula.
        states (tuple, optional): a tuple of two tensors, each of shape
            `[batch_size, hidden_size]`, the previous hidden state,
            corresponding to :math:`h_{t-1}, c_{t-1}` in the formula.
            When states is None, a zero state is used. Defaults to None.

    Returns:
        (outputs, new_states)
        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
            corresponding to :math:`h_{t}` in the formula.
        new_states (tuple): a tuple of two tensors, each of shape
            `[batch_size, hidden_size]`, the new hidden states,
            corresponding to :math:`h_{t}, c_{t}` in the formula.

    Notes:
        All the weights and biases are initialized with `Uniform(-std, std)` by
        default, where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
        information about parameter initialization, please refer to
        :ref:`api_fluid_ParamAttr`.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            x = paddle.randn((4, 16))
            prev_h = paddle.randn((4, 32))
            prev_c = paddle.randn((4, 32))

            cell = paddle.nn.LSTMCell(16, 32)
            y, (h, c) = cell(x, (prev_h, prev_c))

    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super(LSTMCell, self).__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_ih = self.create_parameter(
            (4 * hidden_size, input_size),
            weight_ih_attr,
            default_initializer=I.Uniform(-std, std))
        self.weight_hh = self.create_parameter(
            (4 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = self.create_parameter(
            (4 * hidden_size, ),
            bias_ih_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.bias_hh = self.create_parameter(
            (4 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_hidden, pre_cell = states
        gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
        if self.bias_ih is not None:
            gates = gates + self.bias_ih
        gates += paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            gates = gates + self.bias_hh

        chunked_gates = paddle.split(gates, num_or_sections=4, axis=-1)

        i = self._gate_activation(chunked_gates[0])
        f = self._gate_activation(chunked_gates[1])
        o = self._gate_activation(chunked_gates[3])
        c = f * pre_cell + i * self._activation(chunked_gates[2])
        h = o * self._activation(c)

        return h, (h, c)

    @property
    def state_shape(self):
        r"""
        The `state_shape` of LSTMCell is a tuple with two shapes:
        `((hidden_size, ), (hidden_size, ))` (-1 for batch size would be
        automatically inserted into the shape). These two shapes correspond
        to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
        """
        return ((self.hidden_size, ), (self.hidden_size, ))


class GRUCell(RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
    it computes the outputs and updates the states.

    The formula used is as follows:

    .. math::

        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} \* (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \widetilde{h}_{t}
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
    multiplication operator.

    Please refer to `An Empirical Exploration of Recurrent Network Architectures
    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.

    Arguments:
        input_size (int): The input size.
        hidden_size (int): The hidden size.
        weight_ih_attr (ParamAttr, optional): The parameter attribute for
            `weight_ih`. Default: None.
        weight_hh_attr (ParamAttr, optional): The parameter attribute for
            `weight_hh`. Default: None.
        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
            `bias_ih`. Default: None.
        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
            `bias_hh`. Default: None.
        name (str, optional): Name for the operation (optional, default is
            None). For more information, please refer to :ref:`api_guide_Name`.

    Parameters:
        weight_ih (Parameter): shape (3 * hidden_size, input_size), input to
            hidden weight, which corresponds to the concatenation of
            :math:`W_{ir}, W_{iz}, W_{ic}` in the formula.
        weight_hh (Parameter): shape (3 * hidden_size, hidden_size), hidden to
            hidden weight, which corresponds to the concatenation of
            :math:`W_{hr}, W_{hz}, W_{hc}` in the formula.
        bias_ih (Parameter): shape (3 * hidden_size, ), input to hidden bias,
            which corresponds to the concatenation of
            :math:`b_{ir}, b_{iz}, b_{ic}` in the formula.
        bias_hh (Parameter): shape (3 * hidden_size, ), hidden to hidden bias,
            which corresponds to the concatenation of
            :math:`b_{hr}, b_{hz}, b_{hc}` in the formula.

    Inputs:
        inputs (Tensor): A tensor with shape `[batch_size, input_size]`,
            corresponding to :math:`x_t` in the formula.
        states (Tensor, optional): A tensor with shape `[batch_size, hidden_size]`,
            corresponding to :math:`h_{t-1}` in the formula. When states is
            None, a zero state is used. Defaults to None.

    Returns:
        (outputs, new_states)
        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
            corresponding to :math:`h_{t}` in the formula.
        new_states (Tensor): shape `[batch_size, hidden_size]`, the new hidden
            state, corresponding to :math:`h_{t}` in the formula.

    Notes:
        All the weights and biases are initialized with `Uniform(-std, std)` by
        default, where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
        information about parameter initialization, please refer to
        :ref:`api_fluid_ParamAttr`.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            x = paddle.randn((4, 16))
            prev_h = paddle.randn((4, 32))

            cell = paddle.nn.GRUCell(16, 32)
            y, h = cell(x, prev_h)

    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super(GRUCell, self).__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_ih = self.create_parameter(
            (3 * hidden_size, input_size),
            weight_ih_attr,
            default_initializer=I.Uniform(-std, std))
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = self.create_parameter(
            (3 * hidden_size, ),
            bias_ih_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)

        pre_hidden = states
        x_gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h_gates = h_gates + self.bias_hh

        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)

        r = self._gate_activation(x_r + h_r)
        z = self._gate_activation(x_z + h_z)
        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
        h = (pre_hidden - c) * z + c

        return h, h

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into the shape). The shape
        corresponds to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )


class RNN(Layer):
    r"""
    Wrapper for RNN, which creates a recurrent neural network with an RNN cell.
    It performs :code:`cell.forward()` repeatedly until it reaches the maximum
    length of `inputs`.

    Arguments:
        cell (RNNCellBase): An instance of `RNNCellBase`.
        is_reverse (bool, optional): Indicate whether to calculate in the reverse
            order of input sequences. Defaults to False.
        time_major (bool): Whether the first dimension of the input means the
            time steps. Defaults to False.

    Inputs:
        inputs (Tensor): A (possibly nested structure of) tensor[s]. The input
            sequences.
            If time major is True, the shape is `[time_steps, batch_size, input_size]`.
            If time major is False, the shape is `[batch_size, time_steps, input_size]`,
            where `input_size` is the input size of the cell.
        initial_states (Tensor|list|tuple, optional): Tensor of a possibly
            nested structure of tensors, representing the initial state for
            the rnn cell. If not provided, `cell.get_initial_states` would be
            called to produce the initial states. Defaults to None.
        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
            or int32. The valid lengths of input sequences. Defaults to None.
            If `sequence_length` is not None, the inputs are treated as
            padded sequences. In each input sequence, elements whose time step
            index is not less than the valid length are treated as paddings.
        **kwargs: Additional keyword arguments to pass to `forward` of the cell.

    Returns:
        (outputs, final_states)
        outputs (Tensor|list|tuple): the output sequences.
            If `time_major` is True, the shape is
            `[time_steps, batch_size, hidden_size]`, else
            `[batch_size, time_steps, hidden_size]`.
        final_states (Tensor|list|tuple): final states of the cell. Tensor or
            a possibly nested structure of tensors which has the same structure
            as the initial states. Each tensor in the final states has the same
            shape and dtype as the corresponding tensor in the initial states.

    Notes:
        This class is a low level API for wrapping an rnn cell into an RNN
        network. Users should take care of the state of the cell. If
        `initial_states` is passed to the `forward` method, make sure that it
        satisfies the requirements of the cell.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            inputs = paddle.rand((4, 23, 16))
            prev_h = paddle.randn((4, 32))

            cell = paddle.nn.SimpleRNNCell(16, 32)
            rnn = paddle.nn.RNN(cell)
            outputs, final_states = rnn(inputs, prev_h)

    """

    def __init__(self, cell, is_reverse=False, time_major=False):
        super(RNN, self).__init__()
        self.cell = cell
        if not hasattr(self.cell, "call"):
            # for non-dygraph mode, `rnn` api uses cell.call
            self.cell.call = self.cell.forward
        self.is_reverse = is_reverse
        self.time_major = time_major

    def forward(self,
                inputs,
                initial_states=None,
                sequence_length=None,
                **kwargs):
        final_outputs, final_states = F.rnn(self.cell,
                                            inputs,
                                            initial_states=initial_states,
                                            sequence_length=sequence_length,
                                            time_major=self.time_major,
                                            is_reverse=self.is_reverse,
                                            **kwargs)
        return final_outputs, final_states


class BiRNN(Layer):
    r"""
    Wrapper for bidirectional RNN, which builds a bidirectional RNN given the
    forward rnn cell and backward rnn cell. A BiRNN applies the forward RNN and
    backward RNN with the corresponding cells separately and concatenates the
    outputs along the last axis.

    Arguments:
        cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN.
        cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN.
        time_major (bool): Whether the first dimension of the input means the
            time steps. Defaults to False.

    Inputs:
        inputs (Tensor): the input sequences of both RNN.
            If time_major is True, the shape is
            `[time_steps, batch_size, input_size]`, else the shape is
            `[batch_size, time_steps, input_size]`, where input_size is the
            input size of both cells.
        initial_states (list|tuple, optional): A tuple/list of the initial
            states of the forward cell and backward cell.
            If not provided, `cell.get_initial_states` would be called to
            produce the initial states for each cell. Defaults to None.
        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
            or int32. The valid lengths of input sequences. Defaults to None.
            If `sequence_length` is not None, the inputs are treated as
            padded sequences. In each input sequence, elements whose time step
            index is not less than the valid length are treated as paddings.
        **kwargs: Additional keyword arguments. Arguments passed to `forward`
            for each cell.

    Outputs:
        (outputs, final_states)
        outputs (Tensor): the outputs of the bidirectional RNN. It is the
            concatenation of the outputs from the forward RNN and backward
            RNN along the last axis.
            If time major is True, the shape is `[time_steps, batch_size, size]`,
            else the shape is `[batch_size, time_steps, size]`, where size is
            `cell_fw.hidden_size + cell_bw.hidden_size`.
        final_states (tuple): A tuple of the final states of the forward
            cell and backward cell.

    Notes:
        This class is a low level API for wrapping rnn cells into a BiRNN
        network. Users should take care of the states of the cells.
        If `initial_states` is passed to the `forward` method, make sure that
        it satisfies the requirements of the cells.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            cell_fw = paddle.nn.LSTMCell(16, 32)
            cell_bw = paddle.nn.LSTMCell(16, 32)
            rnn = paddle.nn.BiRNN(cell_fw, cell_bw)

            inputs = paddle.rand((2, 23, 16))
            outputs, final_states = rnn(inputs)

    """

    def __init__(self, cell_fw, cell_bw, time_major=False):
        super(BiRNN, self).__init__()
        self.cell_fw = cell_fw
        self.cell_bw = cell_bw
        if cell_fw.input_size != cell_bw.input_size:
            raise ValueError("input size of the forward cell ({}) does not "
                             "equal that of the backward cell ({})".format(
                                 cell_fw.input_size, cell_bw.input_size))
        for cell in [self.cell_fw, self.cell_bw]:
            if not hasattr(cell, "call"):
                # for non-dygraph mode, `rnn` api uses cell.call
                cell.call = cell.forward
        self.time_major = time_major

    def forward(self,
                inputs,
                initial_states=None,
                sequence_length=None,
                **kwargs):
        if isinstance(initial_states, (list, tuple)):
            assert len(initial_states) == 2, \
                "length of initial_states should be 2 when it is a list/tuple"

        outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs,
                                        initial_states, sequence_length,
                                        self.time_major, **kwargs)
        return outputs, final_states


class RNNMixin(LayerList):
    r"""
    A Mixin class for RNN networks. It provides the `forward` method for
    SimpleRNN, LSTM and GRU.
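
    Examples:

        .. code-block:: python

            # A minimal sketch of building a custom stacked network on top of
            # RNNMixin (RNNMixin is defined in this module and is not exported
            # in `__all__`; the two-layer, unidirectional configuration below
            # is purely illustrative). `RNNMixin.forward` relies on the
            # attributes set at the end of `__init__`.
            import paddle
            paddle.disable_static()

            class StackedSimpleRNN(RNNMixin):
                def __init__(self, input_size, hidden_size):
                    super(StackedSimpleRNN, self).__init__()
                    self.append(paddle.nn.RNN(
                        paddle.nn.SimpleRNNCell(input_size, hidden_size)))
                    self.append(paddle.nn.RNN(
                        paddle.nn.SimpleRNNCell(hidden_size, hidden_size)))
                    # attributes used by RNNMixin.forward
                    self.input_size = input_size
                    self.hidden_size = hidden_size
                    self.dropout = 0.
                    self.num_directions = 1
                    self.time_major = False
                    self.num_layers = 2
                    self.state_components = 1

            rnn = StackedSimpleRNN(16, 32)
            x = paddle.randn((4, 23, 16))
            y, h = rnn(x)  # y: [4, 23, 32], h: [2, 4, 32]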
    """

    def forward(self, inputs, initial_states=None, sequence_length=None):
        batch_index = 1 if self.time_major else 0
        dtype = inputs.dtype
        if initial_states is None:
            state_shape = (self.num_layers * self.num_directions, -1,
                           self.hidden_size)
            if self.state_components == 1:
                initial_states = paddle.fluid.layers.fill_constant_batch_size_like(
                    inputs, state_shape, dtype, 0, batch_index, 1)
            else:
                initial_states = tuple([
                    paddle.fluid.layers.fill_constant_batch_size_like(
                        inputs, state_shape, dtype, 0, batch_index, 1)
                    for _ in range(self.state_components)
                ])

        states = split_states(initial_states, self.num_directions == 2,
                              self.state_components)
        final_states = []

        for i, rnn_layer in enumerate(self):
            if i > 0:
                inputs = F.dropout(
                    inputs,
                    self.dropout,
                    training=self.training,
                    mode="upscale_in_train")
            outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
            final_states.append(final_state)
            inputs = outputs

        final_states = concat_states(final_states, self.num_directions == 2,
                                     self.state_components)
        return outputs, final_states


class SimpleRNN(RNNMixin):
    r"""
    Multilayer Elman network (SimpleRNN). It takes input sequences and initial
    states as inputs, and returns the output sequences and the final states.

    Each layer inside the SimpleRNN maps the input sequences and initial states
    to the output sequences and final states in the following manner: at each
    step, it takes step inputs (:math:`x_{t}`) and previous
    states (:math:`h_{t-1}`) as inputs, and returns step outputs (:math:`y_{t}`)
    and new states (:math:`h_{t}`).

    .. math::

        h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}

    where :math:`\mathrm{tanh}` is the hyperbolic tangent function.

    Arguments:
        input_size (int): The input size for the first layer's cell.
        hidden_size (int): The hidden size for each layer's cell.
        num_layers (int, optional): Number of layers. Defaults to 1.
        activation (str, optional): The activation in each SimpleRNN cell. It can be
            `tanh` or `relu`. Defaults to `tanh`.
        direction (str, optional): The direction of the network. It can be "forward",
            "backward" or "bidirectional". Defaults to "forward".
        dropout (float, optional): The dropout probability. Dropout is applied to the
            input of each layer except for the first layer. Defaults to 0.
        time_major (bool, optional): Whether the first dimension of the input means the
            time steps. Defaults to False.
        weight_ih_attr (ParamAttr, optional): The parameter attribute for
            `weight_ih` of each cell. Defaults to None.
        weight_hh_attr (ParamAttr, optional): The parameter attribute for
            `weight_hh` of each cell. Defaults to None.
        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
            `bias_ih` of each cell. Defaults to None.
        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
            `bias_hh` of each cell. Defaults to None.
        name (str, optional): Name for the operation (optional, default is
            None). For more information, please refer to :ref:`api_guide_Name`.

    Inputs:
        inputs (Tensor): the input sequence.
            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
            else, the shape is `[batch_size, time_steps, input_size]`.
        initial_states (Tensor, optional): the initial state. The shape is
            `[num_layers * num_directions, batch_size, hidden_size]`.
            If initial_states is not given, zero initial states are used.
        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
            or int32. The valid lengths of input sequences. Defaults to None.
            If `sequence_length` is not None, the inputs are treated as
            padded sequences. In each input sequence, elements whose time step
            index is not less than the valid length are treated as paddings.

    Returns:
        (outputs, final_states)
        outputs (Tensor): the output sequence.
            If `time_major` is True, the shape is
            `[time_steps, batch_size, num_directions * hidden_size]`,
            else, the shape is
            `[batch_size, time_steps, num_directions * hidden_size]`.
            Note that `num_directions` is 2 if direction is "bidirectional"
            else 1.
        final_states (Tensor): final states. The shape is
            `[num_layers * num_directions, batch_size, hidden_size]`.
            Note that `num_directions` is 2 if direction is "bidirectional"
            else 1.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            rnn = paddle.nn.SimpleRNN(16, 32, 2)

            x = paddle.randn((4, 23, 16))
            prev_h = paddle.randn((2, 4, 32))
            y, h = rnn(x, prev_h)

    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 activation="tanh",
                 direction="forward",
                 dropout=0.,
                 time_major=False,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super(SimpleRNN, self).__init__()

        if direction in ["forward", "backward"]:
            is_reverse = direction == "backward"
            cell = SimpleRNNCell(input_size, hidden_size, activation,
                                 weight_ih_attr, weight_hh_attr, bias_ih_attr,
                                 bias_hh_attr)
            self.append(RNN(cell, is_reverse, time_major))
            for i in range(1, num_layers):
                cell = SimpleRNNCell(hidden_size, hidden_size, activation,
                                     weight_ih_attr, weight_hh_attr,
                                     bias_ih_attr, bias_hh_attr)
                self.append(RNN(cell, is_reverse, time_major))
        elif direction == "bidirectional":
            cell_fw = SimpleRNNCell(input_size, hidden_size, activation,
                                    weight_ih_attr, weight_hh_attr,
                                    bias_ih_attr, bias_hh_attr)
            cell_bw = SimpleRNNCell(input_size, hidden_size, activation,
                                    weight_ih_attr, weight_hh_attr,
                                    bias_ih_attr, bias_hh_attr)
            self.append(BiRNN(cell_fw, cell_bw, time_major))
            for i in range(1, num_layers):
                cell_fw = SimpleRNNCell(
                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
                    weight_hh_attr, bias_ih_attr, bias_hh_attr)
                cell_bw = SimpleRNNCell(
                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
                    weight_hh_attr, bias_ih_attr, bias_hh_attr)
                self.append(BiRNN(cell_fw, cell_bw, time_major))
        else:
            raise ValueError(
                "direction should be forward, backward or bidirectional, "
                "received direction = {}".format(direction))

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.num_directions = 2 if direction == "bidirectional" else 1
        self.time_major = time_major
        self.num_layers = num_layers
        self.state_components = 1


class LSTM(RNNMixin):
    r"""
    Multilayer LSTM. It takes a sequence and an initial state as inputs, and
    returns the output sequences and the final states.

    Each layer inside the LSTM maps the input sequences and initial states
    to the output sequences and final states in the following manner: at each
    step, it takes step inputs (:math:`x_{t}`) and previous
    states (:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step
    outputs (:math:`y_{t}`) and new states (:math:`h_{t}, c_{t}`).

    .. math::

        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})
        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})
        \widetilde{c}_{t} & = \tanh(W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
        c_{t} & = f_{t} \* c_{t-1} + i_{t} \* \widetilde{c}_{t}
        h_{t} & = o_{t} \* \tanh(c_{t})
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
    multiplication operator.

    Arguments:
        input_size (int): The input size for the first layer's cell.
        hidden_size (int): The hidden size for each layer's cell.
        num_layers (int, optional): Number of layers. Defaults to 1.
        direction (str, optional): The direction of the network. It can be
            "forward", "backward" or "bidirectional". Defaults to "forward".
        dropout (float, optional): The dropout probability. Dropout is applied
            to the input of each layer except for the first layer. Defaults to 0.
        time_major (bool, optional): Whether the first dimension of the input
            means the time steps. Defaults to False.
        weight_ih_attr (ParamAttr, optional): The parameter attribute for
            `weight_ih` of each cell. Default: None.
        weight_hh_attr (ParamAttr, optional): The parameter attribute for
            `weight_hh` of each cell. Default: None.
        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
            `bias_ih` of each cell. Default: None.
        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
            `bias_hh` of each cell. Default: None.
        name (str, optional): Name for the operation (optional, default is
            None). For more information, please refer to :ref:`api_guide_Name`.

    Inputs:
        inputs (Tensor): the input sequence.
            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
            else, the shape is `[batch_size, time_steps, input_size]`.
        initial_states (tuple, optional): the initial state, a tuple of (h, c),
            the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`.
            If initial_states is not given, zero initial states are used.
        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
            or int32. The valid lengths of input sequences. Defaults to None.
            If `sequence_length` is not None, the inputs are treated as
            padded sequences. In each input sequence, elements whose time step
            index is not less than the valid length are treated as paddings.

    Returns:
        (outputs, final_states)
        outputs (Tensor): the output sequence.
            If `time_major` is True, the shape is
            `[time_steps, batch_size, num_directions * hidden_size]`.
            If `time_major` is False, the shape is
            `[batch_size, time_steps, num_directions * hidden_size]`.
            Note that `num_directions` is 2 if direction is "bidirectional"
            else 1.
        final_states (tuple): the final state, a tuple of two tensors, h and c.
            The shape of each is
            `[num_layers * num_directions, batch_size, hidden_size]`.
            Note that `num_directions` is 2 if direction is "bidirectional"
            else 1.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            rnn = paddle.nn.LSTM(16, 32, 2)

            x = paddle.randn((4, 23, 16))
            prev_h = paddle.randn((2, 4, 32))
            prev_c = paddle.randn((2, 4, 32))
            y, (h, c) = rnn(x, (prev_h, prev_c))

    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 direction="forward",
                 dropout=0.,
                 time_major=False,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super(LSTM, self).__init__()

        if direction in ["forward", "backward"]:
            is_reverse = direction == "backward"
            cell = LSTMCell(input_size, hidden_size, weight_ih_attr,
                            weight_hh_attr, bias_ih_attr, bias_hh_attr)
            self.append(RNN(cell, is_reverse, time_major))
            for i in range(1, num_layers):
                cell = LSTMCell(hidden_size, hidden_size, weight_ih_attr,
                                weight_hh_attr, bias_ih_attr, bias_hh_attr)
                self.append(RNN(cell, is_reverse, time_major))
        elif direction == "bidirectional":
            cell_fw = LSTMCell(input_size, hidden_size, weight_ih_attr,
                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
            cell_bw = LSTMCell(input_size, hidden_size, weight_ih_attr,
                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
            self.append(BiRNN(cell_fw, cell_bw, time_major))
            for i in range(1, num_layers):
                cell_fw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr,
                                   weight_hh_attr, bias_ih_attr, bias_hh_attr)
                cell_bw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr,
                                   weight_hh_attr, bias_ih_attr, bias_hh_attr)
                self.append(BiRNN(cell_fw, cell_bw, time_major))
        else:
            raise ValueError(
                "direction should be forward, backward or bidirectional, "
                "received direction = {}".format(direction))

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.num_directions = 2 if direction == "bidirectional" else 1
        self.time_major = time_major
        self.num_layers = num_layers
        self.state_components = 2


class GRU(RNNMixin):
    r"""
    Multilayer GRU. It takes input sequences and initial states as inputs, and
    returns the output sequences and the final states.

    Each layer inside the GRU maps the input sequences and initial states
    to the output sequences and final states in the following manner: at each
    step, it takes step inputs (:math:`x_{t}`) and previous
    states (:math:`h_{t-1}`) as inputs, and returns step outputs (:math:`y_{t}`)
    and new states (:math:`h_{t}`).

    .. math::

        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} \* (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \widetilde{h}_{t}
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
    multiplication operator.

    Arguments:
        input_size (int): The input size for the first layer's cell.
        hidden_size (int): The hidden size for each layer's cell.
        num_layers (int, optional): Number of layers. Defaults to 1.
        direction (str, optional): The direction of the network. It can be
            "forward", "backward" or "bidirectional". Defaults to "forward".
        dropout (float, optional): The dropout probability. Dropout is applied
            to the input of each layer except for the first layer. Defaults to 0.
        time_major (bool, optional): Whether the first dimension of the input
            means the time steps. Defaults to False.
        weight_ih_attr (ParamAttr, optional): The parameter attribute for
            `weight_ih` of each cell. Default: None.
        weight_hh_attr (ParamAttr, optional): The parameter attribute for
            `weight_hh` of each cell. Default: None.
        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
            `bias_ih` of each cell. Default: None.
        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
            `bias_hh` of each cell. Default: None.
        name (str, optional): Name for the operation (optional, default is
            None). For more information, please refer to :ref:`api_guide_Name`.

    Inputs:
        inputs (Tensor): the input sequence.
            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
            else, the shape is `[batch_size, time_steps, input_size]`.
        initial_states (Tensor, optional): the initial state. The shape is
            `[num_layers * num_directions, batch_size, hidden_size]`.
            If initial_states is not given, zero initial states are used.
            Defaults to None.
        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
            or int32. The valid lengths of input sequences. Defaults to None.
            If `sequence_length` is not None, the inputs are treated as
            padded sequences. In each input sequence, elements whose time step
            index is not less than the valid length are treated as paddings.

    Returns:
        (outputs, final_states)
        outputs (Tensor): the output sequence.
            If `time_major` is True, the shape is
            `[time_steps, batch_size, num_directions * hidden_size]`,
            else, the shape is
            `[batch_size, time_steps, num_directions * hidden_size]`.
            Note that `num_directions` is 2 if direction is "bidirectional"
            else 1.
        final_states (Tensor): final states. The shape is
            `[num_layers * num_directions, batch_size, hidden_size]`.
            Note that `num_directions` is 2 if direction is "bidirectional"
            else 1.

    Examples:

        .. code-block:: python

            import paddle
            paddle.disable_static()

            rnn = paddle.nn.GRU(16, 32, 2)

            x = paddle.randn((4, 23, 16))
            prev_h = paddle.randn((2, 4, 32))
            y, h = rnn(x, prev_h)

    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 direction="forward",
                 dropout=0.,
                 time_major=False,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super(GRU, self).__init__()

        if direction in ["forward", "backward"]:
            is_reverse = direction == "backward"
            cell = GRUCell(input_size, hidden_size, weight_ih_attr,
                           weight_hh_attr, bias_ih_attr, bias_hh_attr)
            self.append(RNN(cell, is_reverse, time_major))
            for i in range(1, num_layers):
                cell = GRUCell(hidden_size, hidden_size, weight_ih_attr,
                               weight_hh_attr, bias_ih_attr, bias_hh_attr)
                self.append(RNN(cell, is_reverse, time_major))
        elif direction == "bidirectional":
            cell_fw = GRUCell(input_size, hidden_size, weight_ih_attr,
                              weight_hh_attr, bias_ih_attr, bias_hh_attr)
            cell_bw = GRUCell(input_size, hidden_size, weight_ih_attr,
                              weight_hh_attr, bias_ih_attr, bias_hh_attr)
            self.append(BiRNN(cell_fw, cell_bw, time_major))
            for i in range(1, num_layers):
                cell_fw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr,
                                  weight_hh_attr, bias_ih_attr, bias_hh_attr)
                cell_bw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr,
                                  weight_hh_attr, bias_ih_attr, bias_hh_attr)
                self.append(BiRNN(cell_fw, cell_bw, time_major))
        else:
            raise ValueError(
                "direction should be forward, backward or bidirectional, "
                "received direction = {}".format(direction))

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.num_directions = 2 if direction == "bidirectional" else 1
        self.time_major = time_major
        self.num_layers = num_layers
        self.state_components = 1