From 8e19c324ab82feeea87cd582737ebae5bec95fb8 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 12:47:11 +0800
Subject: [PATCH 01/21] update split_lod_tensor, create_array and array_length
 doc

---
 python/paddle/fluid/layers/control_flow.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 80e8ff484a..114c1f0ed4 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -62,6 +62,8 @@ def split_lod_tensor(input, mask, level=0):
     The output is the true branch and the false branch with the mask applied to
     the input at a certain level in the tensor.
 
+    Mainly used in IfElse to split data into two parts. Related API: IfElse.
+
     Args:
         input(tuple|list|None): The input tensor that contains complete
                                 lod information needed to construct the output.
         mask(list): A bool column vector which masks the input.
@@ -83,6 +85,7 @@ def split_lod_tensor(input, mask, level=0):
         out_true, out_false = layers.split_lod_tensor(
             input=x, mask=y, level=level)
+
     """
     helper = LayerHelper('split_lod_tensor', **locals())
     out_true = helper.create_tmp_variable(dtype=input.dtype)
@@ -887,14 +890,18 @@ def array_write(x, i, array=None):
 
 
 def create_array(dtype):
-    """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
-    LayerHelper.
+    """
+    **Create LoDTensor Array**
+
+    This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
+    LayerHelper. It is mainly used to implement RNN with array_write, array_read
+    and While.
 
     Args:
         dtype (int|float): The data type of the elements in the array.
 
     Returns:
-        Variable: The tensor variable storing the elements of data type.
+        Variable: The lod_tensor_array variable storing the elements of data type.
 
     Examples:
         .. code-block:: python
@@ -1020,9 +1027,14 @@ def shrink_memory(x, i, table):
 
 
 def array_length(array):
-    """This function performs the operation to find the length of the input
+    """
+    **Get the length of Input LoDTensorArray**
+
+    This function performs the operation to find the length of the input
     LOD_TENSOR_ARRAY.
 
+    Related API: array_read, array_write, While.
+
     Args:
         array (LOD_TENSOR_ARRAY): The input array that will be used
                                   to compute the length.

From 2c1e2caa7d8c225040fdc3674df72afd7313f219 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 13:35:00 +0800
Subject: [PATCH 02/21] update document

---
 python/paddle/fluid/layers/control_flow.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 114c1f0ed4..15f294698c 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -60,15 +60,14 @@ def split_lod_tensor(input, mask, level=0):
     This function takes in an input that contains the complete lod information,
     and takes in a mask which is used to mask certain parts of the input.
     The output is the true branch and the false branch with the mask applied to
-    the input at a certain level in the tensor.
-
-    Mainly used in IfElse to split data into two parts. Related API: IfElse.
+    the input at a certain level in the tensor. Mainly used in IfElse to split
+    data into two parts.
 
     Args:
         input(tuple|list|None): The input tensor that contains complete
                                 lod information needed to construct the output.
         mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to rank.
+        level(int): The specific lod level to split.
 
     Returns:
         Variable: The true branch of tensor as per the mask applied to input.
         Variable: The false branch of tensor as per the mask applied to input.
@@ -108,8 +107,9 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
 
     This function takes in an input :math:`x`, the True branch, the False
     branch and a binary :math:`mask`. Using this information, this function
-    merges the True and False branches of the tensor into a single Output
-    at a certain lod level indiacted by :math:`level`.
+    merges the True and False branches of the tensor into a single tensor as
+    output at a certain lod level indicated by :math:`level`. Used in IfElse
+    to merge the output of True block and False block.
 
     Args:
         in_true(tuple|list|None): The True branch to be merged.
         in_false(tuple|list|None): The False branch to be merged.
         x(tuple|list|None): The input tensor that contains complete
                             lod information needed to construct the output.
         mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to rank.
+        level(int): The specific lod level to merge.
 
     Returns:
         Variable: The merged output tensor.

From 4d0fd7e725b259509896f6d891965feec7effee8 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 14:50:56 +0800
Subject: [PATCH 03/21] add API reference for create_tensor

---
 python/paddle/fluid/layers/tensor.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 62b01d595a..6ce486d70d 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -39,6 +39,25 @@ __all__ = [
 
 
 def create_tensor(dtype, name=None, persistable=False):
+    """
+    **Create a Tensor with certain data type and name**
+
+    Args:
+        dtype (string): 'float32'|'int32'|..., the data type of the
+            created tensor.
+        name (string|None): The name of the created tensor, if not set,
+            the name will be a random unique one.
+        persistable (bool): Set the persistable flag of the created tensor,
+            default value is False.
+
+    Returns:
+        Variable: The tensor variable storing the created tensor.
+
+    Examples:
+        .. code-block:: python
+
+          tensor = fluid.layers.create_tensor(dtype='float32')
+    """
     helper = LayerHelper("create_tensor", **locals())
     return helper.create_variable(
         name=helper.name, dtype=dtype, persistable=persistable)

From d82422997a7c21a9d440fae18c6c60c84a5bff7a Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 20:07:17 +0800
Subject: [PATCH 04/21] add doc for batch norm

---
 python/paddle/fluid/layers/nn.py | 51 ++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9e2c06d26f..6719a4d7ec 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1541,8 +1541,55 @@ def batch_norm(input,
                moving_variance_name=None,
                do_model_average_for_mean_and_var=False):
     """
-    This function helps create an operator to implement
-    the BatchNorm layer using the configurations from the input parameters.
+    **Batch Normalization Layer**
+
+    Can be used as a normalizer function for conv2d and fully_connected operations.
+    The required data format for this layer is one of the following:
+    1. NHWC `[batch, in_height, in_width, in_channels]`
+    2. NCHW `[batch, in_channels, in_height, in_width]`
+
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
+    <https://arxiv.org/abs/1502.03167>`_ for more details.
+
+    :math:`input` is the input features over a mini-batch.
+
+    .. math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+        act(string, default None): Activation type, linear|relu|prelu|...
+        is_test(bool, default False): Used for training or testing.
+        momentum(float, default 0.9):
+        epsilon(float, default 1e-05):
+        param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
+        bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
+        data_layout(string, default NCHW): NCHW|NHWC
+        in_place(bool, default False): Make the input and output of batch norm reuse memory.
+        use_mkldnn(bool, Default false): ${use_mkldnn_comment}
+        name(string, Default None): A name for this layer (optional). If set None, the layer
+            will be named automatically.
+        moving_mean_name(string, Default None): The name of moving_mean which stores the global Mean.
+        moving_variance_name(string, Default None): The name of the moving_variance which stores the global Variance.
+        do_model_average_for_mean_and_var(bool, Default False):
+
+    Returns:
+        The sequence's last step variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+            hidden2 = fluid.layers.batch_norm(input=hidden1)
+
     """
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()

From f3e631cd9e59741f3d2531706080e3a94c979f35 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 20:21:18 +0800
Subject: [PATCH 05/21] small update

---
 python/paddle/fluid/layers/nn.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1a010ab3ac..d5db75ebea 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4078,7 +4078,7 @@ def image_resize(input,
                  name=None,
                  resample='BILINEAR'):
     """
-    Resize a batch of images.
+    **Resize a batch of images**
 
     The input must be a tensor of the shape (num_batches, channels, in_h, in_w),
     and the resizing only applies on the last two dimensions(hight and width).
@@ -4208,6 +4208,8 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
 
 def gather(input, index):
     """
+    **Gather Layer**
+
     Output is obtained by gathering entries of the outer-most dimension
     of X indexed by `index` and concatenate them together.

From e72eb0edec589ce6bee07ec5eb34ee7ceca2af33 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 20:23:47 +0800
Subject: [PATCH 06/21] small update

---
 paddle/fluid/operators/detection/polygon_box_transform_op.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
index 335e8dd470..568d50d457 100644
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -83,11 +83,13 @@ class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 PolygonBoxTransform Operator.
+
+PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
+
 The input is the final geometry output in detection network.
 We use 2*n numbers to denote the coordinate shift from n corner vertices of
 the polygon_box to the pixel location. As each distance offset contains two
 numbers (xi, yi), the geometry output contains 2*n channels.
-PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
 )DOC");
   }
 };

From dde0a28073420134d4aae01d91511ead3d0c362a Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 20:51:56 +0800
Subject: [PATCH 07/21] add doc for Switch

---
 python/paddle/fluid/layers/control_flow.py | 25 +++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 15f294698c..7999ee0f80 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1132,6 +1132,28 @@ class ConditionalBlock(object):
 
 
 class Switch(object):
+    """
+    **Switch Class**
+
+    Many programming languages provide `switch` as a generalization of `if-elif-else`.
+    Switch class works just like an `if-elif-else`.
+
+    The Semantics:
+
+    1. A `switch` control-flow checks cases one-by-one.
+    1. The condition of each case is a boolean value, which is a scalar.
+    1. It runs the first matched case, or the default case if there is one.
+    1. Once it matches a case, it runs the corresponding branch and only that branch.
+
+    Examples:
+        .. code-block:: python
+
+            with control_flow.Switch() as switch:
+                with switch.case(global_step == zero_var):
+                    tensor.assign(input=one_var, output=div_res)
+
+    """
+
     def __init__(self, name=None):
         self.helper = LayerHelper('switch', name=name)
         self.inside_scope = False
@@ -1161,7 +1183,8 @@ class Switch(object):
         return ConditionalBlockGuard(cond_block)
 
     def default(self):
-        """create a default case for this switch
+        """
+        create a default case for this switch
         """
         pre_cond_num = len(self.pre_not_conditions)
         if pre_cond_num == 0:

From d76f8a8f5d06419f3db2647fec8956444ca7c1fe Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Tue, 12 Jun 2018 21:24:39 +0800
Subject: [PATCH 08/21] refine doc of polynomial_decay

---
 .../fluid/layers/learning_rate_scheduler.py | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 716cc7824e..2e5cff74c1 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -162,22 +162,27 @@ def polynomial_decay(learning_rate,
                      end_learning_rate=0.0001,
                      power=1.0,
                      cycle=False):
-    """Applies polynomial decay to the initial learning rate.
+    """
+    **polynomial_decay**
+
+    Applies polynomial decay to the initial learning rate.
+
+    .. code-block::python
+
+        if cycle:
+            decay_steps = decay_steps * ceil(global_step / decay_steps)
+        else:
+            global_step = min(global_step, decay_steps)
+        decayed_learning_rate = (learning_rate - end_learning_rate) *
+            (1 - global_step / decay_steps) ^ power + end_learning_rate
 
-    >>> if cycle:
-    >>>     decay_steps = decay_steps * ceil(global_step / decay_steps)
-    >>> else:
-    >>>     global_step = min(global_step, decay_steps)
-    >>> decayed_learning_rate = (learning_rate - end_learning_rate) *
-    >>>                   (1 - global_step / decay_steps) ^ power +
-    >>>                   end_learning_rate
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        end_learning_rate: A Python `float` number.
-        power: A Python `float` number
-        cycle: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        decay_steps(int32): A Python `int32` number.
+        end_learning_rate(float): A Python `float` number.
+        power(float): A Python `float` number
+        cycle(bool, Default False): Boolean. If set true, decay the learning rate every decay_steps.
 
     Returns:
         The decayed learning rate

From 76129f03314afb26acae18e7a5838612f6fb28f0 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Wed, 13 Jun 2018 22:16:23 +0800
Subject: [PATCH 09/21] update comment

---
 python/paddle/fluid/layers/control_flow.py | 7 +++----
 python/paddle/fluid/layers/nn.py           | 5 +++--
 python/paddle/fluid/layers/tensor.py       | 7 +++----
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 7999ee0f80..feac42d94e 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -893,12 +893,11 @@ def create_array(dtype):
     """
     **Create LoDTensor Array**
 
-    This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
-    LayerHelper. It is mainly used to implement RNN with array_write, array_read
-    and While.
+    This function creates an array of LOD_TENSOR_ARRAY. It is mainly used to
+    implement RNN with array_write, array_read and While.
 
     Args:
-        dtype (int|float): The data type of the elements in the array.
+        dtype (int|float): The data type of the elements in the lod_tensor_array.
 
     Returns:
         Variable: The lod_tensor_array variable storing the elements of data type.
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 80452a1e8b..3f3b7e20ef 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1618,8 +1618,9 @@ def batch_norm(input,
     1. NHWC `[batch, in_height, in_width, in_channels]`
     2. NCHW `[batch, in_channels, in_height, in_width]`
 
-    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
-    <https://arxiv.org/abs/1502.03167>`_ for more details.
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_
+    for more details.
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 6ce486d70d..6b7f69807b 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -40,15 +40,14 @@ __all__ = [
 
 def create_tensor(dtype, name=None, persistable=False):
     """
-    **Create a Tensor with certain data type and name**
+    **Create a Tensor**
 
     Args:
         dtype (string): 'float32'|'int32'|..., the data type of the
            created tensor.
-        name (string|None): The name of the created tensor, if not set,
+        name(string, Default: None): The name of the created tensor, if not set,
            the name will be a random unique one.
-        persistable (bool): Set the persistable flag of the created tensor,
-            default value is False.
+        persistable(bool, Default: False): Set the persistable flag of the created tensor.
 
     Returns:
         Variable: The tensor variable storing the created tensor.

From 0ae670917489d24e25e648c85df6a0f8a110f979 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Thu, 14 Jun 2018 10:49:07 +0800
Subject: [PATCH 10/21] update document

---
 python/paddle/fluid/layers/control_flow.py  | 16 +++++++-------
 .../fluid/layers/learning_rate_scheduler.py | 10 +++++-----
 python/paddle/fluid/layers/nn.py            |  2 ++
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index feac42d94e..5354582aaa 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -76,13 +76,13 @@ def split_lod_tensor(input, mask, level=0):
     Examples:
         .. code-block:: python
 
-          x = layers.data(name='x', shape=[1])
+          x = fluid.layers.data(name='x', shape=[1])
           x.persistable = True
 
-          y = layers.data(name='y', shape=[1])
+          y = fluid.layers.data(name='y', shape=[1])
           y.persistable = True
 
-          out_true, out_false = layers.split_lod_tensor(
+          out_true, out_false = fluid.layers.split_lod_tensor(
                 input=x, mask=y, level=level)
@@ -891,7 +891,7 @@ def array_write(x, i, array=None):
 
 def create_array(dtype):
     """
-    **Create LoDTensor Array**
+    **Create LoDTensorArray**
 
     This function creates an array of LOD_TENSOR_ARRAY. It is mainly used to
     implement RNN with array_write, array_read and While.
@@ -989,7 +989,8 @@ def array_read(array, i):
     Returns:
         Variable: The tensor type variable that has the data written to it.
     Examples:
-        .. code-block::python
+        .. code-block:: python
+
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
           arr = layers.array_read(tmp, i=i)
@@ -1027,7 +1028,7 @@ def shrink_memory(x, i, table):
 
 def array_length(array):
     """
-    **Get the length of Input LoDTensorArray**
+    **Get the Length of Input LoDTensorArray**
 
     This function performs the operation to find the length of the input
     LOD_TENSOR_ARRAY.
@@ -1042,12 +1043,13 @@ def array_length(array):
         Variable: The length of the input LoDTensorArray.
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
          arr = fluid.layers.array_write(tmp, i=i)
          arr_len = fluid.layers.array_length(arr)
+
     """
     helper = LayerHelper('array_length', **locals())
     tmp = helper.create_tmp_variable(dtype='int64')
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 2e5cff74c1..2dbc51c23f 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -163,11 +163,11 @@ def polynomial_decay(learning_rate,
                      power=1.0,
                      cycle=False):
     """
-    **polynomial_decay**
+    **Polynomial Decay**
 
     Applies polynomial decay to the initial learning rate.
 
-    .. code-block::python
+    .. code-block:: python
 
         if cycle:
             decay_steps = decay_steps * ceil(global_step / decay_steps)
@@ -180,9 +180,9 @@ def polynomial_decay(learning_rate,
     Args:
         learning_rate(Variable|float32): A scalar float32 value or a Variable. This
           will be the initial learning rate during training
         decay_steps(int32): A Python `int32` number.
-        end_learning_rate(float): A Python `float` number.
-        power(float): A Python `float` number
-        cycle(bool, Default False): Boolean. If set true, decay the learning rate every decay_steps.
+        end_learning_rate(float, Default: 0.0001): A Python `float` number.
+        power(float, Default: 1.0): A Python `float` number
+        cycle(bool, Default: False): Boolean. If set true, decay the learning rate every decay_steps.
 
     Returns:
         The decayed learning rate
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 3f3b7e20ef..7c4393c4d9 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1615,7 +1615,9 @@ def batch_norm(input,
 
     Can be used as a normalizer function for conv2d and fully_connected operations.
     The required data format for this layer is one of the following:
+
     1. NHWC `[batch, in_height, in_width, in_channels]`
+
     2. NCHW `[batch, in_channels, in_height, in_width]`
 
     Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
     Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_
     for more details.

From 21ecd357cffb7165813ffa65b2ab7c810eddfece Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Thu, 14 Jun 2018 11:01:07 +0800
Subject: [PATCH 11/21] little optimize

---
 python/paddle/fluid/layers/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 7c4393c4d9..627718f87e 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4099,7 +4099,7 @@ def image_resize(input,
                  name=None,
                  resample='BILINEAR'):
     """
-    **Resize a batch of images**
+    **Resize a Batch of Images**
 
     The input must be a tensor of the shape (num_batches, channels, in_h, in_w),
     and the resizing only applies on the last two dimensions(hight and width).

From 62bf672eddfdbd8c9287292c5ddae80d4eae2af4 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Thu, 14 Jun 2018 11:20:46 +0800
Subject: [PATCH 12/21] update document for Switch

---
 python/paddle/fluid/layers/control_flow.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 5354582aaa..db5b07558a 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1142,16 +1142,19 @@ class Switch(object):
     The Semantics:
 
     1. A `switch` control-flow checks cases one-by-one.
-    1. The condition of each case is a boolean value, which is a scalar.
-    1. It runs the first matched case, or the default case if there is one.
-    1. Once it matches a case, it runs the corresponding branch and only that branch.
+
+    2. The condition of each case is a boolean value, which is a scalar.
+
+    3. It runs the first matched case, or the default case if there is one.
+
+    4. Once it matches a case, it runs the corresponding branch and only that branch.
 
     Examples:
         .. code-block:: python
 
-            with control_flow.Switch() as switch:
+            with fluid.control_flow.Switch() as switch:
                 with switch.case(global_step == zero_var):
-                    tensor.assign(input=one_var, output=div_res)
+                    fluid.tensor.assign(input=one_var, output=div_res)

From 2f9ed97eb66250a788702e21240bc09fea93b85d Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Thu, 14 Jun 2018 13:54:44 +0800
Subject: [PATCH 13/21] follow comment

---
 python/paddle/fluid/layers/control_flow.py |  2 --
 python/paddle/fluid/layers/nn.py           | 14 +++++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index db5b07558a..5394ac3278 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -55,8 +55,6 @@ __all__ = [
 
 def split_lod_tensor(input, mask, level=0):
     """
-    **split_lod_tensor**
-
     This function takes in an input that contains the complete lod information,
     and takes in a mask which is used to mask certain parts of the input.
     The output is the true branch and the false branch with the mask applied to
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 627718f87e..d3899cd442 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1638,23 +1638,23 @@ def batch_norm(input,
     Args:
         input(variable): The input variable which is a LoDTensor.
-        act(string, default None): Activation type, linear|relu|prelu|...
-        is_test(bool, default False): Used for training or testing.
-        momentum(float, default 0.9):
-        epsilon(float, default 1e-05):
+        act(string, Default None): Activation type, linear|relu|prelu|...
+        is_test(bool, Default False): Used for training or testing.
+        momentum(float, Default 0.9):
+        epsilon(float, Default 1e-05):
         param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
         bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
         data_layout(string, default NCHW): NCHW|NHWC
-        in_place(bool, default False): Make the input and output of batch norm reuse memory.
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
         use_mkldnn(bool, Default false): ${use_mkldnn_comment}
         name(string, Default None): A name for this layer (optional). If set None, the layer
            will be named automatically.
         moving_mean_name(string, Default None): The name of moving_mean which stores the global Mean.
         moving_variance_name(string, Default None): The name of the moving_variance which stores the global Variance.
-        do_model_average_for_mean_and_var(bool, Default False):
+        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
 
     Returns:
-        The sequence's last step variable which is a Tensor.
+        Variable: A tensor variable which is the result after applying batch normalization on the input.
 
     Examples:
 
         .. code-block:: python
 
            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
            hidden2 = fluid.layers.batch_norm(input=hidden1)

From 9de779f1cfae9daba1b38b8df829cb0e56247592 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Fri, 15 Jun 2018 13:18:33 +0800
Subject: [PATCH 14/21] update switch class

---
 python/paddle/fluid/layers/control_flow.py | 25 ++++++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 209a767e73..2bc43c5ce9 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1156,16 +1156,14 @@ class ConditionalBlock(object):
 
 class Switch(object):
     """
-    **Switch Class**
-
-    Many programming languages provide `switch` as a generalization of `if-elif-else`.
-    Switch class works just like an `if-elif-else`.
+    Switch class works just like an `if-elif-else`. Can be used in a learning rate scheduler
+    to modify the learning rate.
 
     The Semantics:
 
    1. A `switch` control-flow checks cases one-by-one.
 
-    2. The condition of each case is a boolean value, which is a scalar.
+    2. The condition of each case is a boolean value, which is a scalar Variable.
 
     3. It runs the first matched case, or the default case if there is one.
 
     4. Once it matches a case, it runs the corresponding branch and only that branch.
 
     Examples:
         .. code-block:: python
 
+            lr = fluid.layers.tensor.create_global_var(
+                shape=[1],
+                value=0.0,
+                dtype='float32',
+                persistable=True,
+                name="learning_rate")
+            one_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
+            two_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=2.0)
+
-            with fluid.control_flow.Switch() as switch:
+            with fluid.layers.control_flow.Switch() as switch:
                 with switch.case(global_step == zero_var):
-                    fluid.tensor.assign(input=one_var, output=div_res)
+                    fluid.layers.tensor.assign(input=one_var, output=lr)
+                with switch.default():
+                    fluid.layers.tensor.assign(input=two_var, output=lr)

From e2783bb6afeb4e5b4160ff4283c18672f2f0632e Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Fri, 15 Jun 2018 14:22:21 +0800
Subject: [PATCH 15/21] update split_lod_tensor doc

---
 python/paddle/fluid/layers/control_flow.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 2bc43c5ce9..e261e3f63a 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -69,8 +69,10 @@ def split_lod_tensor(input, mask, level=0):
         level(int): The specific lod level to split.
 
     Returns:
-        Variable: The true branch of tensor as per the mask applied to input.
-        Variable: The false branch of tensor as per the mask applied to input.
+        tuple(Variable, Variable):
+        The true branch of tensor as per the mask applied to input.
+
+        The false branch of tensor as per the mask applied to input.
 
     Examples:
         .. code-block:: python
 
          x = fluid.layers.data(name='x', shape=[1])
          x.persistable = True
 
          y = fluid.layers.data(name='y', shape=[1])
          y.persistable = True
 
          out_true, out_false = fluid.layers.split_lod_tensor(
                input=x, mask=y, level=level)

From 1c9fc655d0b4745f74940f99acc8421faf8656f5 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Fri, 15 Jun 2018 15:16:14 +0800
Subject: [PATCH 16/21] update

---
 python/paddle/fluid/layers/detection.py     | 73 ++++++++++---------
 .../fluid/layers/learning_rate_scheduler.py | 12 ++-
 python/paddle/fluid/layers/tensor.py        |  8 +-
 3 files changed, 48 insertions(+), 45 deletions(-)

diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index edf528a595..dacb31f8b6 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -603,7 +603,7 @@ def prior_box(input,
               offset=0.5,
               name=None):
     """
-    **Prior box operator**
+    **Prior Box Operator**
 
     Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
     Each position of the input produce N prior boxes, N is determined by
@@ -632,26 +632,30 @@ def prior_box(input,
         name(str): Name of the prior box op. Default: None.
 
     Returns:
-        boxes(Variable): the output prior boxes of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input,
-            num_priors is the total
-            box count of each position of input.
-        Variances(Variable): the expanded variances of PriorBox.
-            The layout is [H, W, num_priors, 4].
-            H is the height of input, W is the width of input
-            num_priors is the total
-            box count of each position of input
+        tuple: A tuple with two Variables (boxes, variances)
+
+        boxes: the output prior boxes of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input,
+        num_priors is the total
+        box count of each position of input.
+
+        variances: the expanded variances of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input
+        num_priors is the total
+        box count of each position of input
 
     Examples:
         .. code-block:: python
-            box, var = prior_box(
-                input=conv1,
-                image=images,
-                min_sizes=[100.],
-                flip=True,
-                clip=True)
+
+            box, var = fluid.layers.prior_box(
+                input=conv1,
+                image=images,
+                min_sizes=[100.],
+                flip=True,
+                clip=True)
     """
     helper = LayerHelper("prior_box", **locals())
     dtype = helper.input_dtype()
@@ -721,11 +725,9 @@ def multi_box_head(inputs,
                    stride=1,
                    name=None):
     """
-    **Prior_boxes**
-
     Generate prior boxes for SSD(Single Shot MultiBox Detector)
     algorithm. The details of this algorithm, please refer the
-    section 2.2 of SSD paper (SSD: Single Shot MultiBox Detector)
+    section 2.2 of SSD paper `SSD: Single Shot MultiBox Detector
+    <https://arxiv.org/abs/1512.02325>`_ .
 
     Args:
@@ -766,24 +768,27 @@ def multi_box_head(inputs,
         name(str): Name of the prior box layer. Default: None.
 
     Returns:
-        mbox_loc(Variable): The predicted boxes' location of the inputs.
-            The layout is [N, H*W*Priors, 4]. where Priors
-            is the number of predicted boxes each position of each input.
-        mbox_conf(Variable): The predicted boxes' confidence of the inputs.
-            The layout is [N, H*W*Priors, C]. where Priors
-            is the number of predicted boxes each position of each input
-            and C is the number of Classes.
-        boxes(Variable): the output prior boxes of PriorBox.
-            The layout is [num_priors, 4]. num_priors is the total
-            box count of each position of inputs.
-        Variances(Variable): the expanded variances of PriorBox.
-            The layout is [num_priors, 4]. num_priors is the total
-            box count of each position of inputs
+        tuple: A tuple with four Variables (mbox_loc, mbox_conf, boxes, variances)
+
+        mbox_loc: The predicted boxes' location of the inputs. The layout
+        is [N, H*W*Priors, 4]. where Priors is the number of predicted
+        boxes each position of each input.
+
+        mbox_conf: The predicted boxes' confidence of the inputs. The layout
+        is [N, H*W*Priors, C]. where Priors is the number of predicted boxes
+        each position of each input and C is the number of Classes.
+
+        boxes: the output prior boxes of PriorBox. The layout is [num_priors, 4].
+        num_priors is the total box count of each position of inputs.
+
+        variances: the expanded variances of PriorBox. The layout is
+        [num_priors, 4]. num_priors is the total box count of each position of inputs
 
     Examples:
         .. code-block:: python
-        mbox_locs, mbox_confs, box, var = layers.multi_box_head(
+
+          mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head(
             inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
             image=images,
            num_classes=21,
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 2dbc51c23f..e76f15d838 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -163,8 +163,6 @@ def polynomial_decay(learning_rate,
                      power=1.0,
                      cycle=False):
     """
-    **Polynomial Decay**
-
     Applies polynomial decay to the initial learning rate.
 
     .. code-block:: python
 
        if cycle:
            decay_steps = decay_steps * ceil(global_step / decay_steps)
        else:
            global_step = min(global_step, decay_steps)
        decayed_learning_rate = (learning_rate - end_learning_rate) *
            (1 - global_step / decay_steps) ^ power + end_learning_rate
 
     Args:
         learning_rate(Variable|float32): A scalar float32 value or a Variable. This
           will be the initial learning rate during training.
         decay_steps(int32): A Python `int32` number.
-        end_learning_rate(float, Default: 0.0001): A Python `float` number.
-        power(float, Default: 1.0): A Python `float` number
-        cycle(bool, Default: False): Boolean. If set true, decay the learning rate every decay_steps.
+        end_learning_rate(float): A Python `float` number.
+        power(float): A Python `float` number.
+        cycle(bool): If set true, decay the learning rate every decay_steps.
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
     """
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 25505e4427..978f7dde29 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -40,14 +40,14 @@ __all__ = [
 
 def create_tensor(dtype, name=None, persistable=False):
     """
-    **Create a Tensor**
+    Create a variable, which will hold a LoDTensor with data type dtype.
 
     Args:
-        dtype (string): 'float32'|'int32'|..., the data type of the
+        dtype(string): 'float32'|'int32'|..., the data type of the
            created tensor.
-        name (string, Default: None): The name of the created tensor, if not set,
+        name(string): The name of the created tensor, if not set,
            the name will be a random unique one.
-        persistable (bool, Default: False): Set the persistable flag of the created tensor.
+        persistable(bool): Set the persistable flag of the created tensor.
 
     Returns:
         Variable: The tensor variable storing the created tensor.
From 6ace04f655be6ea7898b5cbe61dfbdb1e16b7806 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Fri, 15 Jun 2018 16:00:07 +0800 Subject: [PATCH 17/21] update --- paddle/fluid/operators/activation_op.cc | 2 +- python/paddle/fluid/layers/nn.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index c73482eb12..8743c9500a 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -133,7 +133,7 @@ $out = \max(x, 0)$ __attribute__((unused)) constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. -$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ +$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c6c8c7c2d1..485470f281 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4475,6 +4475,7 @@ def image_resize(input, and the resizing only applies on the last two dimensions(hight and width). Supporting resample methods: + 'BILINEAR' : Bilinear interpolation Args: @@ -4494,8 +4495,8 @@ def image_resize(input, Default: 'BILINEAR' Returns: - out (Variable): The output is a 4-D tensor of the shape - (num_batches, channls, out_h, out_w). + Variable: The output is a 4-D tensor of the shape + (num_batches, channls, out_h, out_w). Examples: .. code-block:: python @@ -4579,7 +4580,7 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'): resample (str): resample method, default: BILINEAR. Returns: - out (Variable): The output is a 4-D tensor of the shape + Variable: The output is a 4-D tensor of the shape (num_batches, channls, out_h, out_w). """ in_shape = input.shape From 8f59d79d751e3174f0a6f98783fa1dbdbc279cc2 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Fri, 15 Jun 2018 16:35:53 +0800 Subject: [PATCH 18/21] update doc for sigmoid_cross_entropy_with_logits --- .../fluid/operators/sigmoid_cross_entropy_with_logits_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 135e2a6f7f..c3b0fe3209 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -113,14 +113,14 @@ The logistic loss is given as follows: $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$ -We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get: +We know that $$\sigma(X) = \\frac{1}{1 + \exp(-X)}$$. By substituting this we get: $$loss = X - X * Labels + \log(1 + \exp(-X))$$ For stability and to prevent overflow of $$\exp(-X)$$ when X < 0, we reformulate the loss as follows: - $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$ + $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-\|X\|))$$ Both the input `X` and `Labels` can carry the LoD (Level of Details) information. However the output only shares the LoD with input `X`. 
From a4ee0d0dd165cdc79beca3e0904a7adf5bf58d9c Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Sat, 16 Jun 2018 08:58:48 +0800
Subject: [PATCH 19/21] add reverse

---
 python/paddle/fluid/layers/tensor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 4c97ca40d8..18e0fedcc4 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -35,6 +35,7 @@ __all__ = [
     'argmax',
     'ones',
     'zeros',
+    'reverse',
 ]

From 82a4cf19608c7655a6b6394c65a10933f3b64dc0 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Sun, 17 Jun 2018 11:44:25 +0800
Subject: [PATCH 20/21] update image_resize_short and shape doc

---
 paddle/fluid/operators/shape_op.cc | 9 ++++++---
 python/paddle/fluid/layers/nn.py   | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index c75fce7959..b44d5f8980 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -36,10 +36,13 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Input", "(Tensor), The input tensor.");
-    AddOutput("Out", "(Tensor), The shape of input tensor.");
+    AddOutput("Out",
+              "(Tensor), The shape of input tensor, the data type of the shape"
+              " is int64_t, and it will be on the same device as the input Tensor.");
     AddComment(R"DOC(
-Shape Operator.
-Get the shape of input tensor.
+Shape Operator
+
+Get the shape of input tensor. Only CPU input Tensor is supported now.
 )DOC");
   }
 };
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a3b2d2b777..40e72aa488 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4650,7 +4650,7 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
 
     Returns:
         Variable: The output is a 4-D tensor of the shape
-            (num_batches, channls, out_h, out_w).
+        (num_batches, channels, out_h, out_w).

From 46ae1c93c28d346d9a4c6a4bf7c9d1019216403b Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Sun, 17 Jun 2018 14:00:49 +0800
Subject: [PATCH 21/21] add doc for softmax

---
 python/paddle/fluid/layers/nn.py | 39 ++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 6032573393..d31d12f971 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1258,6 +1258,45 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
 
 
 def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
+    """
+    The input of the softmax layer is a 2-D tensor with shape N x K (N is the
+    batch_size, K is the dimension of input feature). The output tensor has the
+    same shape as the input tensor.
+
+    For each row of the input tensor, the softmax operator squashes the
+    K-dimensional vector of arbitrary real values to a K-dimensional vector of real
+    values in the range [0, 1] that add up to 1.
+
+    It computes the exponential of the given dimension and the sum of exponential
+    values of all the other dimensions in the K-dimensional vector input.
+    Then the ratio of the exponential of the given dimension and the sum of
+    exponential values of all the other dimensions is the output of the softmax
+    operator.
+
+    For each row :math:`i` and each column :math:`j` in Input(X), we have:
+
+    .. math::
+
+        Out[i, j] = \\frac{\\exp(X[i, j])}{\\sum_j \\exp(X[i, j])}
+
+    Args:
+        input (Variable): The input variable.
+        bias_attr (ParamAttr): attributes for bias
+        param_attr (ParamAttr): attributes for parameter
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
+            library is installed.
+
+    Returns:
+        Variable: output of softmax
+
+    Examples:
+
+        .. code-block:: python
+
+             fc = fluid.layers.fc(input=x, size=10)
+             softmax = fluid.layers.softmax(input=fc)
+
+    """
     helper = LayerHelper('softmax', **locals())
     dtype = helper.input_dtype()
     softmax_out = helper.create_tmp_variable(dtype)
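
To close, the row-wise softmax formula documented in the final patch can be reproduced directly. A small NumPy sketch follows; it is illustrative only and not from the patches, and subtracting the per-row maximum is a standard stability trick rather than something the docstring mandates:

.. code-block:: python

    import numpy as np

    def softmax_rows(x):
        # Matches the docstring formula:
        # out[i, j] = exp(x[i, j]) / sum_j exp(x[i, j]).
        # Shifting by the per-row max avoids overflow in exp() and
        # leaves the result unchanged.
        shifted = x - x.max(axis=1, keepdims=True)
        e = np.exp(shifted)
        return e / e.sum(axis=1, keepdims=True)

    x = np.array([[1.0, 2.0, 3.0],
                  [0.0, 0.0, 0.0]])
    out = softmax_rows(x)
    print(out.sum(axis=1))  # each row sums to 1.0
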