@@ -176,6 +176,7 @@ __all__ = [
    'get_tensor_from_selected_rows',
    'lstm',
    'psroi_pool',
    'huber_loss',
]

kIgnoreIndex = -100

@@ -497,7 +498,7 @@ def lstm(input,
If the device is a GPU, this op uses the cuDNN LSTM implementation.

A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass, the output h_t and cell output c_t for a given iteration can be computed from the recurrent input h_{t-1},
the cell input c_{t-1} and the previous layer input x_t, given matrices W, R and biases b_W, b_R, from the following equations:

$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
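
The remaining gate equations lie outside this hunk. As a small NumPy illustration of the input-gate formula above (the array names, shapes and values are made up for this sketch and are not part of the API):

.. code-block:: python

    import numpy as np

    input_size, hidden_size = 4, 3
    x_t = np.random.rand(input_size)                 # previous layer input x_t
    h_prev = np.random.rand(hidden_size)             # recurrent input h_{t-1}
    W_ix = np.random.rand(hidden_size, input_size)
    W_ih = np.random.rand(hidden_size, hidden_size)
    bx_i = np.zeros(hidden_size)
    bh_i = np.zeros(hidden_size)

    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    i_t = sigmoid(np.dot(W_ix, x_t) + np.dot(W_ih, h_prev) + bx_i + bh_i)  # input gate
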
@@ -524,19 +525,19 @@ def lstm(input,
- $\tilde{c_t}$ is also called the candidate hidden state,
  which is computed based on the current input and the previous hidden state.

Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents point-wise multiplication,
and X represents matrix multiplication.


Args:
    input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
    init_h (Variable): The initial hidden state of the LSTM.
        This is a tensor with shape ( num_layers x batch_size x hidden_size );
        if is_bidirec = True, the shape should be ( num_layers*2 x batch_size x hidden_size ).
    init_c (Variable): The initial cell state of the LSTM.
        This is a tensor with shape ( num_layers x batch_size x hidden_size );
        if is_bidirec = True, the shape should be ( num_layers*2 x batch_size x hidden_size ).
    max_len (int): max length of LSTM. The first dim of the input tensor CAN NOT be greater than max_len.
    hidden_size (int): hidden size of the LSTM.
    num_layers (int): total number of layers of the LSTM.
    dropout_prob (float|0.0): dropout prob, dropout ONLY works between rnn layers, NOT between time steps.

@@ -555,10 +556,10 @@ def lstm(input,
        if is_bidirec is set to True, the shape will be ( seq_len x batch_size x hidden_size*2 )
    last_h (Tensor): the hidden state of the last step of the LSTM;
        the shape is ( num_layers x batch_size x hidden_size );
        if is_bidirec is set to True, the shape will be ( num_layers*2 x batch_size x hidden_size ).
    last_c (Tensor): the cell state of the last step of the LSTM;
        the shape is ( num_layers x batch_size x hidden_size );
        if is_bidirec is set to True, the shape will be ( num_layers*2 x batch_size x hidden_size ).


Examples:
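
The example body itself lies outside this hunk. A minimal sketch of a call that is consistent with the arguments documented above (the sizes, tensor name and zero-filled initial states are illustrative assumptions, not the library's own example):

.. code-block:: python

    import paddle.fluid as fluid

    max_len, batch_size, input_size, hidden_size, num_layers = 100, 32, 128, 256, 1

    x = fluid.layers.data(name='x', shape=[max_len, batch_size, input_size],
                          dtype='float32', append_batch_size=False)
    init_h = fluid.layers.fill_constant(
        shape=[num_layers, batch_size, hidden_size], dtype='float32', value=0.0)
    init_c = fluid.layers.fill_constant(
        shape=[num_layers, batch_size, hidden_size], dtype='float32', value=0.0)

    rnn_out, last_h, last_c = fluid.layers.lstm(
        x, init_h, init_c, max_len, hidden_size, num_layers)
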
@@ -4658,7 +4659,7 @@ def ctc_greedy_decoder(input, blank, name=None):
              [0.5, 0.1, 0.3, 0.1]]

input.lod = [[4, 4]]

Computation:

step1: Apply argmax to the first input sequence, which is input.data[0:4]. Then we get:
@@ -4691,7 +4692,7 @@ def ctc_greedy_decoder(input, blank, name=None):
Returns:
    Variable: CTC greedy decode result, which is a 2-D tensor with shape [Lp, 1].
    'Lp' is the sum of all output sequences' lengths. If all the sequences
    in the result were empty, the result LoDTensor will be [-1] with
    LoD [[]] and dims [1, 1].

Examples:
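
The example body is outside this hunk. As a toy NumPy sketch of the greedy decode described above (the probabilities and the choice of blank = 0 are made up for the sketch; the real op works on LoDTensors):

.. code-block:: python

    import numpy as np

    probs = np.array([[0.6, 0.1, 0.3, 0.1],
                      [0.3, 0.2, 0.4, 0.1],
                      [0.1, 0.5, 0.1, 0.3],
                      [0.5, 0.1, 0.3, 0.1]])
    blank = 0

    tokens = probs.argmax(axis=1)                      # per-step argmax: [0, 2, 1, 0]
    merged = [t for i, t in enumerate(tokens)
              if i == 0 or t != tokens[i - 1]]         # merge consecutive repeats
    decoded = [t for t in merged if t != blank]        # drop the blank label
    # decoded == [2, 1]
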
@@ -5045,7 +5046,7 @@ def hsigmoid(input,
"""
The hierarchical sigmoid operator is used to accelerate the training
process of language models. This operator organizes the classes into a
complete binary tree, or you can use is_custom to pass your own tree to
implement the hierarchy. Each leaf node represents a class (a word) and each
internal node acts as a binary classifier. For each word there's a unique
path from the root to its leaf node; hsigmoid calculates the cost for each
@@ -5061,7 +5062,7 @@ def hsigmoid(input,
2. build a dict to store word_id -> the word's leaf-to-root path; we call it path_table.
3. build a dict to store word_id -> the code of the word's leaf-to-root path; we call it path_code. The code
   is the label of each binary classification along the path, using 1 to indicate true and 0 to indicate false.
4. now each word has its path and the code along the path; you can pass a batch of paths and codes
   related to the same batch of inputs, as in the toy layout sketched below.
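
As a purely illustrative layout for such a batch (a made-up complete binary tree over four words with internal nodes numbered 0..2 and the root as node 0; the node numbering, codes and dtype are assumptions for the sketch, not the operator's required convention):

.. code-block:: python

    import numpy as np

    # One row per word: the internal-node indices on its leaf-to-root path,
    # and the binary label taken at each of those nodes.
    path_table = np.array([[1, 0],
                           [1, 0],
                           [2, 0],
                           [2, 0]], dtype='int64')
    path_code = np.array([[0, 0],
                          [1, 0],
                          [0, 1],
                          [1, 1]], dtype='int64')
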
@@ -5071,8 +5072,8 @@ def hsigmoid(input,
        and :math:`D` is the feature size.
    label (Variable): The tensor variable that contains the labels of the training data.
        It's a tensor with shape :math:`[N \\times 1]`.
    num_classes (int): The number of classes; must not be less than 2. With the default tree this has to be set
        and should never be None under is_custom=False; but when is_custom is True, it should be the number of non-leaf nodes,
        which indicates the number of classes used by the binary classifiers.
    param_attr (ParamAttr|None): The parameter attribute for the learnable parameters/weights
        of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5085,15 +5086,15 @@ def hsigmoid(input,
        is not set, the bias is initialized to zero. Default: None.
    name (str|None): A name for this layer (optional). If set to None, the layer
        will be named automatically. Default: None.
    path_table (Variable|None): this variable can store each batch of samples' path to the root;
        it should be in leaf -> root order.
        path_table should have the same shape as path_code, and for each sample i, path_table[i] is an np.array-like
        structure in which each element is an index into the parent nodes' weight matrix.
    path_code (Variable|None): this variable can store each batch of samples' codes;
        each code consists of the codes of its parent nodes. It should be in leaf -> root order.
    is_custom (bool|False): use a user-defined binary tree instead of the default complete binary tree. If is_custom is
        set, you need to set path_table, path_code and num_classes; otherwise num_classes should be set.
    is_sparse (bool|False): use sparse update instead of dense update. If set, the gradient
        of W and input will be sparse.

Returns:
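
The Returns block and the example fall outside this hunk. A minimal sketch of a call in the default complete-binary-tree mode (the feature size and class count are arbitrary):

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[2], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='int64')
    out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
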
@@ -9377,3 +9378,51 @@ def psroi_pool(input,
            'pooled_width': pooled_width
        })
    return out


def huber_loss(input, label, delta):
    """
    Huber loss is a loss function used in robust regression.
    Huber loss can evaluate the fitness of input to label.
    Different from MSE loss, Huber loss is more robust for outliers.

    When the difference between input and label is larger than delta:

    .. math::

        huber\_loss = delta * |label - input| - 0.5 * delta * delta

    When the difference between input and label is less than delta:

    .. math::

        huber\_loss = 0.5 * (label - input) * (label - input)

    Args:
        input (Variable): This input is a probability computed by the previous operator.
            The first dimension is batch size, and the last dimension is 1.
        label (Variable): The ground truth whose first dimension is batch size
            and last dimension is 1.
        delta (float): The parameter of huber loss, which controls
            the range of outliers.

    Returns:
        huber_loss (Variable): The huber loss with shape [batch_size, 1].

    Examples:

        .. code-block:: python

            x = fluid.layers.data(name='x', shape=[1], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
            predictions = fluid.layers.softmax(x)
            loss = fluid.layers.huber_loss(input=predictions, label=label, delta=1.0)
    """
    helper = LayerHelper('huber_loss', **locals())
    residual = helper.create_variable_for_type_inference(
        dtype=helper.input_dtype())
    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
    helper.append_op(
        type='huber_loss',
        inputs={'X': input,
                'Y': label},
        outputs={'Out': out,
                 'Residual': residual},
        attrs={'delta': delta})
    return out
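
As a quick numeric check of the piecewise definition in the docstring above (plain Python, independent of the layer; the threshold and sample residuals are arbitrary):

.. code-block:: python

    def huber(residual, delta):
        # quadratic inside the delta band, linear outside it
        if abs(residual) <= delta:
            return 0.5 * residual * residual
        return delta * abs(residual) - 0.5 * delta * delta

    print(huber(0.5, 1.0))   # 0.125 (quadratic branch)
    print(huber(3.0, 1.0))   # 2.5   (linear branch: 1.0 * 3.0 - 0.5 * 1.0 * 1.0)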