@@ -440,20 +440,20 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
     """
     Simple LSTM Cell.
 
-    It just combine a mix_layer with fully_matrix_projection and a lstmemory
+    It just combines a mixed layer with fully_matrix_projection and a lstmemory
     layer. The simple lstm cell is implemented by the following equations.
 
     .. math::
 
-        i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
 
-        f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
 
-        c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_t c_{t-1} + i_t \\tanh(W_{xc}x_t + W_{hc}h_{t-1} + b_c)
 
-        o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
 
-        h_t = o_t tanh(c_t)
+        h_t & = o_t \\tanh(c_t)
 
     Please refer to **Generating Sequences With Recurrent Neural Networks**
     if you want to know what LSTM is. The Link_ is here.
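For intuition, a minimal NumPy sketch of one step of the equations above (standalone and illustrative, not part of the patch; the peephole weights :math:`W_{ci}, W_{cf}, W_{co}` are taken as diagonal, i.e. elementwise, as is common in practice):

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x_t, h_prev, c_prev, W, b):
        # W: dict of weight matrices keyed by gate ('xi', 'hi', 'xc', ...);
        # the peephole entries 'ci', 'cf', 'co' are vectors (diagonal matrices).
        i_t = sigmoid(W['xi'].dot(x_t) + W['hi'].dot(h_prev) + W['ci'] * c_prev + b['i'])
        f_t = sigmoid(W['xf'].dot(x_t) + W['hf'].dot(h_prev) + W['cf'] * c_prev + b['f'])
        c_t = f_t * c_prev + i_t * np.tanh(W['xc'].dot(x_t) + W['hc'].dot(h_prev) + b['c'])
        o_t = sigmoid(W['xo'].dot(x_t) + W['ho'].dot(h_prev) + W['co'] * c_t + b['o'])
        h_t = o_t * np.tanh(c_t)
        return h_t, c_t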
@@ -502,28 +502,42 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
 
 @wrap_name_default('lstm_unit')
-def lstmemory_unit(input, name=None, size=None,
-                   mixed_bias_attr=None, mixed_layer_attr=None,
-                   param_attr=None, lstm_bias_attr=None,
-                   act=None, gate_act=None,
-                   state_act=None, lstm_layer_attr=None,
+def lstmemory_unit(input, name=None, size=None, param_attr=None,
+                   act=None, gate_act=None, state_act=None,
+                   mixed_bias_attr=None, lstm_bias_attr=None,
+                   mixed_layer_attr=None, lstm_layer_attr=None,
                    get_output_layer_attr=None):
     """
-    TODO(yuyang18): complete docs
-
-    @param input:
-    @param name:
-    @param size:
-    @param mixed_bias_attr:
-    @param mixed_layer_attr:
-    @param param_attr:
-    @param lstm_bias_attr:
-    @param act:
-    @param gate_act:
-    @param state_act:
-    @param lstm_layer_attr:
-    @param get_output_layer_attr:
-    @return:
+    :param input: input layer.
+    :type input: LayerOutput
+    :param name: lstmemory unit name.
+    :type name: basestring
+    :param size: lstmemory unit size.
+    :type size: int
+    :param param_attr: Parameter config, None if use default.
+    :type param_attr: ParameterAttribute
+    :param act: lstm final activation type.
+    :type act: BaseActivation
+    :param gate_act: lstm gate activation type.
+    :type gate_act: BaseActivation
+    :param state_act: lstm state activation type.
+    :type state_act: BaseActivation
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+                            False means no bias, None means default bias.
+    :type mixed_bias_attr: ParameterAttribute|False
+    :param lstm_bias_attr: bias parameter attribute of lstm layer.
+                           False means no bias, None means default bias.
+    :type lstm_bias_attr: ParameterAttribute|False
+    :param mixed_layer_attr: mixed layer's extra attribute.
+    :type mixed_layer_attr: ExtraLayerAttribute
+    :param lstm_layer_attr: lstm layer's extra attribute.
+    :type lstm_layer_attr: ExtraLayerAttribute
+    :param get_output_layer_attr: get output layer's extra attribute.
+    :type get_output_layer_attr: ExtraLayerAttribute
+    :return: lstmemory unit layer.
+    :rtype: LayerOutput
     """
     if size is None:
         assert input.size % 4 == 0
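The assertion above implies that when ``size`` is omitted it is inferred from the input width (presumably as ``input.size / 4``), since the input of an lstmemory layer carries the four gate blocks. A usage sketch (illustrative, not part of the patch; ``emb`` is an assumed input layer defined elsewhere):

.. code-block:: python

    # 'emb' is a hypothetical input layer; the activations are spelled
    # out here for clarity.
    lstm_step = lstmemory_unit(input=emb,
                               size=256,
                               act=TanhActivation(),
                               gate_act=SigmoidActivation(),
                               state_act=TanhActivation())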
@@ -560,32 +574,48 @@ def lstmemory_unit(input, name=None, size=None,
 
 @wrap_name_default('lstm_group')
 def lstmemory_group(input, size=None, name=None,
                     reverse=False, param_attr=None,
-                    mix_bias_attr=None, lstm_bias_attr=None,
                     act=None, gate_act=None, state_act=None,
+                    mixed_bias_attr=None, lstm_bias_attr=None,
                     mixed_layer_attr=None, lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    TODO(yuyang18): complete docs
-
-    @param input:
-    @param size:
-    @param name:
-    @param reverse:
-    @param param_attr:
-    @param mix_bias_attr:
-    @param lstm_bias_attr:
-    @param act:
-    @param gate_act:
-    @param state_act:
-    @param mixed_layer_attr:
-    @param lstm_layer_attr:
-    @param get_output_layer_attr:
-    @return:
+    :param input: input layer.
+    :type input: LayerOutput
+    :param name: lstmemory group name.
+    :type name: basestring
+    :param size: lstmemory group size.
+    :type size: int
+    :param reverse: whether the lstm runs reversed over the sequence.
+    :type reverse: bool
+    :param param_attr: Parameter config, None if use default.
+    :type param_attr: ParameterAttribute
+    :param act: lstm final activation type.
+    :type act: BaseActivation
+    :param gate_act: lstm gate activation type.
+    :type gate_act: BaseActivation
+    :param state_act: lstm state activation type.
+    :type state_act: BaseActivation
+    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+                            False means no bias, None means default bias.
+    :type mixed_bias_attr: ParameterAttribute|False
+    :param lstm_bias_attr: bias parameter attribute of lstm layer.
+                           False means no bias, None means default bias.
+    :type lstm_bias_attr: ParameterAttribute|False
+    :param mixed_layer_attr: mixed layer's extra attribute.
+    :type mixed_layer_attr: ExtraLayerAttribute
+    :param lstm_layer_attr: lstm layer's extra attribute.
+    :type lstm_layer_attr: ExtraLayerAttribute
+    :param get_output_layer_attr: get output layer's extra attribute.
+    :type get_output_layer_attr: ExtraLayerAttribute
+    :return: lstmemory group layer.
+    :rtype: LayerOutput
     """
 
     def __lstm_step__(ipt):
         return lstmemory_unit(input=ipt, name=name,
-                              size=size, mixed_bias_attr=mix_bias_attr,
+                              size=size, mixed_bias_attr=mixed_bias_attr,
                               mixed_layer_attr=mixed_layer_attr,
                               param_attr=param_attr,
                               lstm_bias_attr=lstm_bias_attr,
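In other words, lstmemory_group applies lstmemory_unit to every time step through a recurrent group, so it can be fed a whole sequence directly. A usage sketch mirroring the unit example above (illustrative, not part of the patch; ``emb`` is an assumed sequence input):

.. code-block:: python

    # 'emb' is a hypothetical sequence input layer defined elsewhere.
    lstm = lstmemory_group(input=emb,
                           size=256,
                           reverse=False,
                           act=TanhActivation(),
                           gate_act=SigmoidActivation(),
                           state_act=TanhActivation())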
@@ -760,13 +790,14 @@ def simple_attention(encoded_sequence,
 
     The size of the context vector equals the size of encoded_sequence.
 
     .. math::
-        a(s_{i-1},h_{j}) = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
-    .. math::
-        e_{i,j} = a(s_{i-1}, h_{j})
-    .. math::
-        a_{i,j} = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}}
-    .. math::
-        c_{i} = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
+
+        a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{i-1} + U_{a}h_{j})
+
+        e_{i,j} & = a(s_{i-1}, h_{j})
+
+        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_{x}}{exp(e_{i,k})}}
+
+        c_{i} & = \\sum_{j=1}^{T_{x}} a_{i,j}h_{j}
 
     where :math:`h_{j}` is the jth element of encoded_sequence,
     :math:`U_{a}h_{j}` is the jth element of encoded_proj
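For intuition, a minimal NumPy sketch of one decoder step of these equations (standalone and illustrative, not part of the patch; the alignment nonlinearity ``f`` is taken to be tanh):

.. code-block:: python

    import numpy as np

    def attention_step(s_prev, enc_seq, enc_proj, W_a, v_a):
        # enc_seq: (T, H) rows h_j; enc_proj: (T, D) rows U_a h_j
        e = np.tanh(W_a.dot(s_prev) + enc_proj).dot(v_a)  # scores e_{i,j}, shape (T,)
        a = np.exp(e - e.max())                           # softmax over positions j
        a /= a.sum()                                      # weights a_{i,j}
        return a.dot(enc_seq)                             # context c_i, shape (H,)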
@@ -778,6 +809,7 @@ def simple_attention(encoded_sequence,
 
     https://arxiv.org/abs/1409.0473.
 
+    The example usage is:
 
     .. code-block:: python
 
         context = simple_attention(encoded_sequence=enc_seq,