diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst index ee036ecbac..294f6e4d31 100644 --- a/doc/ui/api/trainer_config_helpers/activations.rst +++ b/doc/ui/api/trainer_config_helpers/activations.rst @@ -51,7 +51,7 @@ SequenceSoftmaxActivation ========================= .. automodule:: paddle.trainer_config_helpers.activations - :members: SequenceSoftmax + :members: SequenceSoftmaxActivation :noindex: ReluActivation diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index 8051d29716..a09d5e3d4d 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -136,6 +136,18 @@ gru_step_layer Recurrent Layer Group ===================== +recurrent_group +--------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: recurrent_group + :noindex: + +beam_search +------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: beam_search + :noindex: + get_output_layer ----------------- .. automodule:: paddle.trainer_config_helpers.layers diff --git a/doc/ui/api/trainer_config_helpers/networks.rst b/doc/ui/api/trainer_config_helpers/networks.rst index 59370b71a6..2a15b34eae 100644 --- a/doc/ui/api/trainer_config_helpers/networks.rst +++ b/doc/ui/api/trainer_config_helpers/networks.rst @@ -43,34 +43,52 @@ vgg_16_network Recurrent ========= +LSTM +---- + lstmemory_unit --------------- +`````````````` .. automodule:: paddle.trainer_config_helpers.networks :members: lstmemory_unit :noindex: lstmemory_group ---------------- +``````````````` .. automodule:: paddle.trainer_config_helpers.networks :members: lstmemory_group :noindex: +simple_lstm +``````````` +.. automodule:: paddle.trainer_config_helpers.networks + :members: simple_lstm + :noindex: + +bidirectional_lstm +`````````````````` +.. automodule:: paddle.trainer_config_helpers.networks + :members: bidirectional_lstm + :noindex: + +GRU +--- + gru_unit ---------- +```````` .. automodule:: paddle.trainer_config_helpers.networks :members: gru_unit :noindex: -simple_lstm ------------ +gru_group +````````` .. automodule:: paddle.trainer_config_helpers.networks - :members: simple_lstm + :members: gru_group :noindex: -bidirectional_lstm ------------------- +simple_gru +`````````` .. automodule:: paddle.trainer_config_helpers.networks - :members: bidirectional_lstm + :members: simple_gru :noindex: simple_attention diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/ui/api/trainer_config_helpers/optimizers.rst index 31b9e057fb..3c683914f4 100644 --- a/doc/ui/api/trainer_config_helpers/optimizers.rst +++ b/doc/ui/api/trainer_config_helpers/optimizers.rst @@ -10,10 +10,10 @@ AdamOptimizer :members: AdamOptimizer :noindex: -AdamxOptimizer +AdamaxOptimizer ================ .. 
automodule:: paddle.trainer_config_helpers.optimizers - :members: AdamxOptimizer + :members: AdamaxOptimizer :noindex: AdaGradOptimizer diff --git a/paddle/.gitignore b/paddle/.gitignore index f46fece211..b89bd9d946 100644 --- a/paddle/.gitignore +++ b/paddle/.gitignore @@ -28,9 +28,8 @@ ld-linux-x86-64.so.2 x86_64-scm-linux-gnu/ .lint.*.md5 -examples/crf/*.bin - .idea/ +.test_env Paddle_wrap.cxx Paddle_wrap.h paddle.py diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h index cb3c51c7bd..e080a40141 100644 --- a/paddle/gserver/layers/LstmLayer.h +++ b/paddle/gserver/layers/LstmLayer.h @@ -97,13 +97,13 @@ protected: * @param starts Each start position of each samples. * @param inputValue The input values. */ - void forwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardSequence(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputValue); /** * Compute lstm backward one sequence by one sequence. */ - void backwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); + void backwardSequence(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputGrad); /** * Compute lstm forward one batch by one batch. The batch value is @@ -121,21 +121,21 @@ protected: * } * @endcode */ - void forwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardBatch(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputValue); /** * Compute lstm backward one batch by one batch. */ - void backwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); + void backwardBatch(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputGrad); /** * This function only supports GPU. It not need to reorganize input into * batch value. It will launch one kernel to parallelly compute forward * propagation in sequence level. */ - void forwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardSeqParallel(int batchSize, size_t numSequences, const int *starts, + MatrixPtr inputValue); /** * Backward propagation corresponding to forwardSeqParallel. */ @@ -157,7 +157,8 @@ protected: /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$. std::unique_ptr weight_; /// Learned bias parameter, shape: (1, 7 * size). - /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$. + /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, + /// W_{co}\f$. std::unique_ptr bias_; /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$. MatrixPtr localBias_; diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d8903ff818..1eaf26fdbf 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -669,7 +669,7 @@ def fc_layer(input, size, act=None, name=None, act=LinearActivation(), bias_attr=False) - which is equal to: + which is equal to: .. code-block:: python @@ -795,15 +795,15 @@ def lstmemory(input, name=None, reverse=False, act=None, .. 
math:: - i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) - f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) - c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) - o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) - h_t = o_t tanh(c_t) + h_t & = o_t tanh(c_t) NOTE: In paddle's implementation, the multiply operation @@ -1294,8 +1294,6 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No label=data_layer, num_classes=3) - :param name: layer name - :type name: basestring :param input: Input layers. It could be a LayerOutput or list/tuple of LayerOutput. :type input: LayerOutput|list|tuple @@ -1303,6 +1301,8 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, layer_attr=No :type label: LayerOutput :param num_classes: number of classes. :type num_classes: int + :param name: layer name + :type name: basestring :param bias_attr: Bias attribute. None means default bias. False means no bias. :type bias_attr: ParameterAttribute|False @@ -1943,18 +1943,18 @@ def lstm_step_layer(input, state, size, act=None, .. math:: - i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) - f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) - c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) - o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) - h_t = o_t tanh(c_t) + h_t & = o_t tanh(c_t) - The input\_ of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use + The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these input vector. @@ -2347,12 +2347,12 @@ def eos_layer(input, eos_id, name=None, layer_attr=None): eos = eos_layer(input=layer, eos_id=id) + :param name: Layer name. + :type name: basestring :param input: Input layer name. :type input: LayerOutput :param eos_id: end id of sequence :type eos_id: int - :param name: Layer name. - :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. :return: layer name. @@ -2529,11 +2529,11 @@ def conv_operator(input, filter_size, num_filters, :param num_filter: channel of output data. :type num_filter: int :param num_channel: channel of input data. - :rtype num_channel: int + :type num_channel: int :param stride: The x dimension of the stride. - :rtype stride: int + :type stride: int :param stride_y: The y dimension of the stride. - :rtype stride_y: int + :type stride_y: int :param padding: The x dimension of padding. :type padding: int :param padding_y: The y dimension of padding. @@ -2632,7 +2632,7 @@ def tensor_layer(input, size, act=None, name=None, :param input: Input layer. :type input: LayerOutput|list|tuple. :param size: the layer dimension. - :rtype: int. + :type size: int. :param act: Activation Type. Default is tanh. :type act: BaseActivation :param param_attr: The Parameter Attribute. 
@@ -2840,7 +2840,7 @@ def convex_comb_layer(input, size, name=None): """ A layer for convex weighted average of vectors takes two inputs. - Input: a vector containing the convex weights (batchSize x weightdim), - and a matrix in a vector form (batchSize x (weightdim*datadim)). + and a matrix in a vector form (batchSize x (weightdim * datadim)). - Output: a vector (batchSize * datadim). .. math:: @@ -2893,8 +2893,8 @@ def block_expand_layer(input, name=None): """ Expand feature map to minibatch matrix. - - matrix width is: block_y * block_x * channel - - matirx height is: outputH * outputW + - matrix width is: block_y * block_x * channel + - matrix height is: outputH * outputW .. math:: @@ -3100,11 +3100,11 @@ def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0): .. math:: - C_{i,j} = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) + C_{i,j} & = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) - o_{i,j} = o_i - o_j + o_{i,j} & = o_i - o_j - \\tilde{P_{i,j}} = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\} + \\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\} In this formula: - :math:`C_{i,j}` is the cross entropy cost. diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 65bd9102dc..b162304b91 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -440,20 +440,20 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None, """ Simple LSTM Cell. - It just combine a mix_layer with fully_matrix_projection and a lstmemory + It just combines a mixed layer with full_matrix_projection and a lstmemory layer. The simple lstm cell was implemented as follow equations. .. math:: - i_t = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) - f_t = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) - c_t = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) - o_t = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) - h_t = o_t tanh(c_t) + h_t & = o_t tanh(c_t) Please refer **Generating Sequences With Recurrent Neural Networks** if you want to know what lstm is. Link_ is here. @@ -502,28 +502,42 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None, @wrap_name_default('lstm_unit') -def lstmemory_unit(input, name=None, size=None, - mixed_bias_attr=None, mixed_layer_attr=None, - param_attr=None, lstm_bias_attr=None, - act=None, gate_act=None, - state_act=None, lstm_layer_attr=None, +def lstmemory_unit(input, name=None, size=None, param_attr=None, + act=None, gate_act=None, state_act=None, + mixed_bias_attr=None, lstm_bias_attr=None, + mixed_layer_attr=None, lstm_layer_attr=None, get_output_layer_attr=None): """ TODO(yuyang18): complete docs - @param input: - @param name: - @param size: - @param mixed_bias_attr: - @param mixed_layer_attr: - @param param_attr: - @param lstm_bias_attr: - @param act: - @param gate_act: - @param state_act: - @param lstm_layer_attr: - @param get_output_layer_attr: - @return: + :param input: input layer name. + :type input: LayerOutput + :param name: lstmemory unit name. + :type name: basestring + :param size: lstmemory unit size. + :type size: int + :param param_attr: Parameter config, None if use default.
+ :type param_attr: ParameterAttribute + :param act: lstm final activate type + :type act: BaseActivation + :param gate_act: lstm gate activate type + :type gate_act: BaseActivation + :param state_act: lstm state activate type. + :type state_act: BaseActivation + :param mixed_bias_attr: bias parameter attribute of mixed layer. + False means no bias, None means default bias. + :type mixed_bias_attr: ParameterAttribute|False + :param lstm_bias_attr: bias parameter attribute of lstm layer. + False means no bias, None means default bias. + :type lstm_bias_attr: ParameterAttribute|False + :param mixed_layer_attr: mixed layer's extra attribute. + :type mixed_layer_attr: ExtraLayerAttribute + :param lstm_layer_attr: lstm layer's extra attribute. + :type lstm_layer_attr: ExtraLayerAttribute + :param get_output_layer_attr: get output layer's extra attribute. + :type get_output_layer_attr: ExtraLayerAttribute + :return: lstmemory unit name. + :rtype: LayerOutput """ if size is None: assert input.size % 4 == 0 @@ -560,32 +574,48 @@ def lstmemory_unit(input, name=None, size=None, @wrap_name_default('lstm_group') def lstmemory_group(input, size=None, name=None, reverse=False, param_attr=None, - mix_bias_attr=None, lstm_bias_attr=None, act=None, gate_act=None, state_act=None, + mixed_bias_attr=None, lstm_bias_attr=None, mixed_layer_attr=None, lstm_layer_attr=None, get_output_layer_attr=None): """ TODO(yuyang18): complete docs - @param input: - @param size: - @param name: - @param reverse: - @param param_attr: - @param mix_bias_attr: - @param lstm_bias_attr: - @param act: - @param gate_act: - @param state_act: - @param mixed_layer_attr: - @param lstm_layer_attr: - @param get_output_layer_attr: - @return: + :param input: input layer name. + :type input: LayerOutput + :param name: lstmemory group name. + :type name: basestring + :param size: lstmemory group size. + :type size: int + :param reverse: is lstm reversed + :type reverse: bool + :param param_attr: Parameter config, None if use default. + :type param_attr: ParameterAttribute + :param act: lstm final activate type + :type act: BaseActivation + :param gate_act: lstm gate activate type + :type gate_act: BaseActivation + :param state_act: lstm state activate type. + :type state_act: BaseActivation + :param mixed_bias_attr: bias parameter attribute of mixed layer. + False means no bias, None means default bias. + :type mixed_bias_attr: ParameterAttribute|False + :param lstm_bias_attr: bias parameter attribute of lstm layer. + False means no bias, None means default bias. + :type lstm_bias_attr: ParameterAttribute|False + :param mixed_layer_attr: mixed layer's extra attribute. + :type mixed_layer_attr: ExtraLayerAttribute + :param lstm_layer_attr: lstm layer's extra attribute. + :type lstm_layer_attr: ExtraLayerAttribute + :param get_output_layer_attr: get output layer's extra attribute. + :type get_output_layer_attr: ExtraLayerAttribute + :return: lstmemory group name. + :rtype: LayerOutput """ def __lstm_step__(ipt): return lstmemory_unit(input=ipt, name=name, - size=size, mixed_bias_attr=mix_bias_attr, + size=size, mixed_bias_attr=mixed_bias_attr, mixed_layer_attr=mixed_layer_attr, param_attr=param_attr, lstm_bias_attr=lstm_bias_attr, @@ -760,13 +790,14 @@ def simple_attention(encoded_sequence, Size of the context vector equals to size of encoded_sequence. .. math:: - a(s_{i-1},h_{j}) = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j}) - .. math:: - e_{i,j} = a(s_{i-1}, h_{j}) - .. math:: - a_{i,j} = \\frac{exp(e_{i,i})}{\\sum_{k=1}^{T_{x}{exp(e_{i,k})}}} - .. 
math:: - c_{i} = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j} + + a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{i-1} + U_{a}h_{j}) + + e_{i,j} & = a(s_{i-1}, h_{j}) + + a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_{x}}exp(e_{i,k})} + + c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j} where :math:`h_{j}` is the jth element of encoded_sequence, :math:`U_{a}h_{j}` is the jth element of encoded_proj @@ -778,6 +809,7 @@ def simple_attention(encoded_sequence, https://arxiv.org/abs/1409.0473. The example usage is: + .. code-block:: python context = simple_attention(encoded_sequence=enc_seq, diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index c49a2e3652..f0e51c3de5 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -61,7 +61,7 @@ class BaseSGDOptimizer(Optimizer): .. math:: - w:= w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w) + w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w) where :math:`\\eta` is learning rate. And :math:`n` is batch size. @@ -99,9 +99,9 @@ class AdamOptimizer(BaseSGDOptimizer): .. math:: - m(w, t) &:= \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\ - v(w, t) &:= \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\ - w &:= w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}} + m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\ + v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\ + w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}} :param beta1: the :math:`\\beta_1` in equation. :type beta1: float @@ -136,11 +136,12 @@ class AdamaxOptimizer(BaseSGDOptimizer): The details of please refer this `Adam: A Method for Stochastic Optimization `_ + .. math:: - m_t &:= \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\ - u_t &:= max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\ - w_t &:= w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t + m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\ + u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\ + w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t :param beta1: the :math:`\\beta_1` in the equation. :type beta1: float @@ -175,7 +176,7 @@ class AdaGradOptimizer(BaseSGDOptimizer): .. math:: G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\ - w &:= w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g + w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g """ def to_setting_kwargs(self): @@ -197,8 +198,8 @@ class RMSPropOptimizer(BaseSGDOptimizer): .. math:: - v(w, t) &:= \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ - w &:= w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w) + v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ + w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w) :param rho: the :math:`\\rho` in the equation. The forgetting factor. :type rho: float
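For reference, the helpers whose documentation this patch adds or reorders (the renamed AdamaxOptimizer, the re-documented lstmemory_group) can be combined as in the minimal sketch below. It is illustrative only: the layer names, sizes, and optimizer hyper-parameters are hypothetical and are not taken from the patch.

.. code-block:: python

    from paddle.trainer_config_helpers import *

    # Hypothetical hyper-parameters; AdamaxOptimizer is the corrected name
    # documented in optimizers.rst above.
    settings(batch_size=128,
             learning_rate=1e-3,
             learning_method=AdamaxOptimizer(beta1=0.9, beta2=0.999))

    # Hypothetical network. The mixed layer projects the embedding to
    # 4 * size, matching the `input.size % 4 == 0` assertion shown in
    # lstmemory_unit, and feeds the newly documented lstmemory_group.
    data = data_layer(name='word', size=8192)
    emb = embedding_layer(input=data, size=256)
    lstm_input = mixed_layer(size=256 * 4,
                             input=[full_matrix_projection(input=emb)])
    lstm = lstmemory_group(input=lstm_input,
                           size=256,
                           act=TanhActivation(),
                           gate_act=SigmoidActivation(),
                           state_act=TanhActivation())
    output = fc_layer(input=lstm, size=10, act=SoftmaxActivation())
    outputs(output)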