diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst index ee036ecbac..294f6e4d31 100644 --- a/doc/ui/api/trainer_config_helpers/activations.rst +++ b/doc/ui/api/trainer_config_helpers/activations.rst @@ -51,7 +51,7 @@ SequenceSoftmaxActivation ========================= .. automodule:: paddle.trainer_config_helpers.activations - :members: SequenceSoftmax + :members: SequenceSoftmaxActivation :noindex: ReluActivation diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index 8051d29716..a09d5e3d4d 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -136,6 +136,18 @@ gru_step_layer Recurrent Layer Group ===================== +recurrent_group +--------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: recurrent_group + :noindex: + +beam_search +------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: beam_search + :noindex: + get_output_layer ----------------- .. automodule:: paddle.trainer_config_helpers.layers diff --git a/doc/ui/api/trainer_config_helpers/networks.rst b/doc/ui/api/trainer_config_helpers/networks.rst index 59370b71a6..2a15b34eae 100644 --- a/doc/ui/api/trainer_config_helpers/networks.rst +++ b/doc/ui/api/trainer_config_helpers/networks.rst @@ -43,34 +43,52 @@ vgg_16_network Recurrent ========= +LSTM +---- + lstmemory_unit --------------- +`````````````` .. automodule:: paddle.trainer_config_helpers.networks :members: lstmemory_unit :noindex: lstmemory_group ---------------- +``````````````` .. automodule:: paddle.trainer_config_helpers.networks :members: lstmemory_group :noindex: +simple_lstm +``````````` +.. automodule:: paddle.trainer_config_helpers.networks + :members: simple_lstm + :noindex: + +bidirectional_lstm +`````````````````` +.. automodule:: paddle.trainer_config_helpers.networks + :members: bidirectional_lstm + :noindex: + +GRU +--- + gru_unit ---------- +```````` .. automodule:: paddle.trainer_config_helpers.networks :members: gru_unit :noindex: -simple_lstm ------------ +gru_group +````````` .. automodule:: paddle.trainer_config_helpers.networks - :members: simple_lstm + :members: gru_group :noindex: -bidirectional_lstm ------------------- +simple_gru +`````````` .. automodule:: paddle.trainer_config_helpers.networks - :members: bidirectional_lstm + :members: simple_gru :noindex: simple_attention diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/ui/api/trainer_config_helpers/optimizers.rst index 31b9e057fb..3c683914f4 100644 --- a/doc/ui/api/trainer_config_helpers/optimizers.rst +++ b/doc/ui/api/trainer_config_helpers/optimizers.rst @@ -10,10 +10,10 @@ AdamOptimizer :members: AdamOptimizer :noindex: -AdamxOptimizer +AdamaxOptimizer ================ .. 
automodule:: paddle.trainer_config_helpers.optimizers - :members: AdamxOptimizer + :members: AdamaxOptimizer :noindex: AdaGradOptimizer diff --git a/paddle/.gitignore b/paddle/.gitignore index f46fece211..b89bd9d946 100644 --- a/paddle/.gitignore +++ b/paddle/.gitignore @@ -28,9 +28,8 @@ ld-linux-x86-64.so.2 x86_64-scm-linux-gnu/ .lint.*.md5 -examples/crf/*.bin - .idea/ +.test_env Paddle_wrap.cxx Paddle_wrap.h paddle.py diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h index cb3c51c7bd..e080a40141 100644 --- a/paddle/gserver/layers/LstmLayer.h +++ b/paddle/gserver/layers/LstmLayer.h @@ -97,13 +97,13 @@ protected: * @param starts Each start position of each samples. * @param inputValue The input values. */ - void forwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardSequence(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputValue); /** * Compute lstm backward one sequence by one sequence. */ - void backwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); + void backwardSequence(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputGrad); /** * Compute lstm forward one batch by one batch. The batch value is @@ -121,21 +121,21 @@ protected: * } * @endcode */ - void forwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardBatch(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputValue); /** * Compute lstm backward one batch by one batch. */ - void backwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); + void backwardBatch(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputGrad); /** * This function only supports GPU. It not need to reorganize input into * batch value. It will launch one kernel to parallelly compute forward * propagation in sequence level. */ - void forwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardSeqParallel(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputValue); /** * Backward propagation corresponding to forwardSeqParallel. */ @@ -157,7 +157,8 @@ protected: /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$. std::unique_ptr weight_; /// Learned bias parameter, shape: (1, 7 * size). - /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$. + /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, + /// W_{co}\f$. std::unique_ptr bias_; /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$. MatrixPtr localBias_; diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d8903ff818..1eaf26fdbf 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -669,7 +669,7 @@ def fc_layer(input, size, act=None, name=None, act=LinearActivation(), bias_attr=False) - which is equal to: + which is equal to: .. code-block:: python @@ -795,15 +795,15 @@ def lstmemory(input, name=None, reverse=False, act=None, .. 
math:: - i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) - f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) - c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) - o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) - h_t = o_t tanh(c_t) + h_t & = o_t tanh(c_t) NOTE: In paddle's implementation, the multiply operation @@ -1294,8 +1294,6 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No label=data_layer, num_classes=3) - :param name: layer name - :type name: basestring :param input: Input layers. It could be a LayerOutput or list/tuple of LayerOutput. :type input: LayerOutput|list|tuple @@ -1303,6 +1301,8 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No :type label: LayerOutput :param num_classes: number of classes. :type num_classes: int + :param name: layer name + :type name: basestring :param bias_attr: Bias attribute. None means default bias. False means no bias. :type bias_attr: ParameterAttribute|False @@ -1943,18 +1943,18 @@ def lstm_step_layer(input, state, size, act=None, .. math:: - i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) - f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) - c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) - o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) - h_t = o_t tanh(c_t) + h_t & = o_t tanh(c_t) - The input\_ of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use + The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these input vector. @@ -2347,12 +2347,12 @@ def eos_layer(input, eos_id, name=None, layer_attr=None): eos = eos_layer(input=layer, eos_id=id) + :param name: Layer name. + :type name: basestring :param input: Input layer name. :type input: LayerOutput :param eos_id: end id of sequence :type eos_id: int - :param name: Layer name. - :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. :return: layer name. @@ -2529,11 +2529,11 @@ def conv_operator(input, filter_size, num_filters, :param num_filter: channel of output data. :type num_filter: int :param num_channel: channel of input data. - :rtype num_channel: int + :type num_channel: int :param stride: The x dimension of the stride. - :rtype stride: int + :type stride: int :param stride_y: The y dimension of the stride. - :rtype stride_y: int + :type stride_y: int :param padding: The x dimension of padding. :type padding: int :param padding_y: The y dimension of padding. @@ -2632,7 +2632,7 @@ def tensor_layer(input, size, act=None, name=None, :param input: Input layer. :type input: LayerOutput|list|tuple. :param size: the layer dimension. - :rtype: int. + :type size: int. :param act: Activation Type. Default is tanh. :type act: BaseActivation :param param_attr: The Parameter Attribute. 
@@ -2840,7 +2840,7 @@ def convex_comb_layer(input, size, name=None): """ A layer for convex weighted average of vectors takes two inputs. - Input: a vector containing the convex weights (batchSize x weightdim), - and a matrix in a vector form (batchSize x (weightdim*datadim)). + and a matrix in a vector form (batchSize x (weightdim * datadim)). - Output: a vector (batchSize * datadim). .. math:: @@ -2893,8 +2893,8 @@ def block_expand_layer(input, name=None): """ Expand feature map to minibatch matrix. - - matrix width is: block_y * block_x * channel - - matirx height is: outputH * outputW + - matrix width is: block_y * block_x * channel + - matrix height is: outputH * outputW .. math:: @@ -3100,11 +3100,11 @@ def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0): .. math:: - C_{i,j} = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) + C_{i,j} & = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) - o_{i,j} = o_i - o_j + o_{i,j} & = o_i - o_j - \\tilde{P_{i,j}} = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\} + \\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\} In this formula: - :math:`C_{i,j}` is the cross entropy cost. diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 65bd9102dc..b162304b91 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -440,20 +440,20 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None, """ Simple LSTM Cell. - It just combine a mix_layer with fully_matrix_projection and a lstmemory + It just combines a mixed layer with full_matrix_projection and a lstmemory layer. The simple lstm cell was implemented as follow equations. .. math:: - i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) - f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) - c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) - o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) - h_t = o_t tanh(c_t) + h_t & = o_t tanh(c_t) Please refer **Generating Sequences With Recurrent Neural Networks** if you want to know what lstm is. Link_ is here. @@ -502,28 +502,42 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None, @wrap_name_default('lstm_unit') -def lstmemory_unit(input, name=None, size=None, - mixed_bias_attr=None, mixed_layer_attr=None, - param_attr=None, lstm_bias_attr=None, - act=None, gate_act=None, - state_act=None, lstm_layer_attr=None, +def lstmemory_unit(input, name=None, size=None, param_attr=None, + act=None, gate_act=None, state_act=None, + mixed_bias_attr=None, lstm_bias_attr=None, + mixed_layer_attr=None, lstm_layer_attr=None, get_output_layer_attr=None): """ TODO(yuyang18): complete docs - @param input: - @param name: - @param size: - @param mixed_bias_attr: - @param mixed_layer_attr: - @param param_attr: - @param lstm_bias_attr: - @param act: - @param gate_act: - @param state_act: - @param lstm_layer_attr: - @param get_output_layer_attr: - @return: + :param input: input layer name. + :type input: LayerOutput + :param name: lstmemory unit name. + :type name: basestring + :param size: lstmemory unit size. + :type size: int + :param param_attr: Parameter config, None if use default.
+ :type param_attr: ParameterAttribute + :param act: lstm final activate type + :type act: BaseActivation + :param gate_act: lstm gate activate type + :type gate_act: BaseActivation + :param state_act: lstm state activate type. + :type state_act: BaseActivation + :param mixed_bias_attr: bias parameter attribute of mixed layer. + False means no bias, None means default bias. + :type mixed_bias_attr: ParameterAttribute|False + :param lstm_bias_attr: bias parameter attribute of lstm layer. + False means no bias, None means default bias. + :type lstm_bias_attr: ParameterAttribute|False + :param mixed_layer_attr: mixed layer's extra attribute. + :type mixed_layer_attr: ExtraLayerAttribute + :param lstm_layer_attr: lstm layer's extra attribute. + :type lstm_layer_attr: ExtraLayerAttribute + :param get_output_layer_attr: get output layer's extra attribute. + :type get_output_layer_attr: ExtraLayerAttribute + :return: lstmemory unit name. + :rtype: LayerOutput """ if size is None: assert input.size % 4 == 0 @@ -560,32 +574,48 @@ def lstmemory_unit(input, name=None, size=None, @wrap_name_default('lstm_group') def lstmemory_group(input, size=None, name=None, reverse=False, param_attr=None, - mix_bias_attr=None, lstm_bias_attr=None, act=None, gate_act=None, state_act=None, + mixed_bias_attr=None, lstm_bias_attr=None, mixed_layer_attr=None, lstm_layer_attr=None, get_output_layer_attr=None): """ TODO(yuyang18): complete docs - @param input: - @param size: - @param name: - @param reverse: - @param param_attr: - @param mix_bias_attr: - @param lstm_bias_attr: - @param act: - @param gate_act: - @param state_act: - @param mixed_layer_attr: - @param lstm_layer_attr: - @param get_output_layer_attr: - @return: + :param input: input layer name. + :type input: LayerOutput + :param name: lstmemory group name. + :type name: basestring + :param size: lstmemory group size. + :type size: int + :param reverse: is lstm reversed + :type reverse: bool + :param param_attr: Parameter config, None if use default. + :type param_attr: ParameterAttribute + :param act: lstm final activate type + :type act: BaseActivation + :param gate_act: lstm gate activate type + :type gate_act: BaseActivation + :param state_act: lstm state activate type. + :type state_act: BaseActivation + :param mixed_bias_attr: bias parameter attribute of mixed layer. + False means no bias, None means default bias. + :type mixed_bias_attr: ParameterAttribute|False + :param lstm_bias_attr: bias parameter attribute of lstm layer. + False means no bias, None means default bias. + :type lstm_bias_attr: ParameterAttribute|False + :param mixed_layer_attr: mixed layer's extra attribute. + :type mixed_layer_attr: ExtraLayerAttribute + :param lstm_layer_attr: lstm layer's extra attribute. + :type lstm_layer_attr: ExtraLayerAttribute + :param get_output_layer_attr: get output layer's extra attribute. + :type get_output_layer_attr: ExtraLayerAttribute + :return: lstmemory group name. + :rtype: LayerOutput """ def __lstm_step__(ipt): return lstmemory_unit(input=ipt, name=name, - size=size, mixed_bias_attr=mix_bias_attr, + size=size, mixed_bias_attr=mixed_bias_attr, mixed_layer_attr=mixed_layer_attr, param_attr=param_attr, lstm_bias_attr=lstm_bias_attr, @@ -760,13 +790,14 @@ def simple_attention(encoded_sequence, Size of the context vector equals to size of encoded_sequence. .. math:: - a(s_{i-1},h_{j}) = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j}) - .. math:: - e_{i,j} = a(s_{i-1}, h_{j}) - .. math:: - a_{i,j} = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}} - .. 
math:: - c_{i} = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j} + + a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{i-1} + U_{a}h_{j}) + + e_{i,j} & = a(s_{i-1}, h_{j}) + + a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_{x}}exp(e_{i,k})} + + c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j} where :math:`h_{j}` is the jth element of encoded_sequence, :math:`U_{a}h_{j}` is the jth element of encoded_proj @@ -778,6 +809,7 @@ def simple_attention(encoded_sequence, https://arxiv.org/abs/1409.0473. The example usage is: + .. code-block:: python context = simple_attention(encoded_sequence=enc_seq, diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index c49a2e3652..f0e51c3de5 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -61,7 +61,7 @@ class BaseSGDOptimizer(Optimizer): .. math:: - w:= w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w) + w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w) where :math:`\\eta` is learning rate. And :math:`n` is batch size. @@ -99,9 +99,9 @@ class AdamOptimizer(BaseSGDOptimizer): .. math:: - m(w, t) &:= \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\ - v(w, t) &:= \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\ - w &:= w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}} + m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\ + v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\ + w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}} :param beta1: the :math:`\\beta_1` in equation. :type beta1: float @@ -136,11 +136,12 @@ class AdamaxOptimizer(BaseSGDOptimizer): The details of please refer this `Adam: A Method for Stochastic Optimization `_ + .. math:: - m_t &:= \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\ - u_t &:= max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\ - w_t &:= w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t + m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\ + u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\ + w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t :param beta1: the :math:`\\beta_1` in the equation. :type beta1: float @@ -175,7 +176,7 @@ class AdaGradOptimizer(BaseSGDOptimizer): .. math:: G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\ - w &:= w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g + w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g """ def to_setting_kwargs(self): @@ -197,8 +198,8 @@ class RMSPropOptimizer(BaseSGDOptimizer): .. math:: - v(w, t) &:= \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ - w &:= w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w) + v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ + w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w) :param rho: the :math:`\\rho` in the equation. The forgetting factor. :type rho: float
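For reference, the helpers whose documentation this patch adds or reorders (the renamed AdamaxOptimizer, the re-documented lstmemory_group) can be combined as in the minimal sketch below. It is illustrative only: the layer names, sizes, and optimizer hyper-parameters are hypothetical and are not taken from the patch.

.. code-block:: python

    from paddle.trainer_config_helpers import *

    # Hypothetical hyper-parameters; AdamaxOptimizer is the corrected name
    # documented in optimizers.rst above.
    settings(batch_size=128,
             learning_rate=1e-3,
             learning_method=AdamaxOptimizer(beta1=0.9, beta2=0.999))

    # Hypothetical network. The mixed layer projects the embedding to
    # 4 * size, matching the `input.size % 4 == 0` assertion shown in
    # lstmemory_unit, and feeds the newly documented lstmemory_group.
    data = data_layer(name='word', size=8192)
    emb = embedding_layer(input=data, size=256)
    lstm_input = mixed_layer(size=256 * 4,
                             input=[full_matrix_projection(input=emb)])
    lstm = lstmemory_group(input=lstm_input,
                           size=256,
                           act=TanhActivation(),
                           gate_act=SigmoidActivation(),
                           state_act=TanhActivation())
    output = fc_layer(input=lstm, size=10, act=SoftmaxActivation())
    outputs(output)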