From 9cf1f351d25a2ab6a307f2c629f8e36fa33f4702 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 7 Jun 2018 10:51:54 +0800
Subject: [PATCH 01/69] refine nlp test

---
 paddle/fluid/inference/tests/book/test_inference_nlp.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index a0e83a1705..def6231815 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -104,9 +104,9 @@ void ThreadRunInfer(
     const int tid, paddle::framework::Scope* scope,
     const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
   // maybe framework:ProgramDesc is not thread-safe
+  paddle::platform::CPUPlace place;
+  paddle::framework::Executor executor(place);
   auto& sub_scope = scope->NewScope();
-  auto place = paddle::platform::CPUPlace();
-  auto executor = paddle::framework::Executor(place);
   auto inference_program =
       paddle::inference::Load(&executor, scope, FLAGS_model_path);
 
@@ -183,8 +183,8 @@ TEST(inference, nlp) {
     stop_ms = GetCurrentMs();
   } else {
     // 1. Define place, executor, scope
-    auto place = paddle::platform::CPUPlace();
-    auto executor = paddle::framework::Executor(place);
+    paddle::platform::CPUPlace place;
+    paddle::framework::Executor executor(place);
 
     // 2. Initialize the inference_program and load parameters
     std::unique_ptr<paddle::framework::ProgramDesc> inference_program;

From f326b0117e456e3a0cc6b38bcb819b3b56bef959 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 7 Jun 2018 10:56:35 +0800
Subject: [PATCH 02/69] refine scope lock

---
 paddle/fluid/framework/scope.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index bb2d866c82..fd23fdeab7 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -78,6 +78,7 @@ Variable* Scope::FindVarInternal(const std::string& name) const {
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
+  std::unique_lock<std::mutex> lock(mutex_);
   for (auto& kv : vars_) {
     if (kv.second.get() == var) {
       return this;
@@ -127,6 +128,7 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
   auto origin_it = vars_.find(origin_name);
   PADDLE_ENFORCE(origin_it != vars_.end(),
                  "Cannot find original variable with name %s", origin_name);

From b8d315fb6941908a1ade7a4b19c8276a07e3e874 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 7 Jun 2018 20:35:24 +0800
Subject: [PATCH 03/69] make scope thread safe

---
 paddle/fluid/framework/scope.cc | 77 ++++++++++++++++++++-------------
 paddle/fluid/framework/scope.h  | 12 ++++-
 2 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index fd23fdeab7..50f374e370 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -43,49 +43,29 @@ Scope& Scope::NewScope() const {
 }
 
 Variable* Scope::Var(const std::string& name) {
-  // acquire the lock when new var under this scope
   std::unique_lock<std::mutex> lock(mutex_);
-  auto* v = FindVarLocally(name);
-  if (v != nullptr) return v;
-
-  v = new Variable();
-  vars_[name].reset(v);
-  VLOG(3) << "Create variable " << name;
-  v->name_ = &(vars_.find(name)->first);
-  return v;
+  return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
-    *name = var_name;
+    *name = new_name;
   }
-  return Var(var_name);
+  return VarInternal(new_name);
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  // acquire the lock when find var
   std::unique_lock<std::mutex> lock(mutex_);
   return FindVarInternal(name);
 }
 
-Variable* Scope::FindVarInternal(const std::string& name) const {
-  auto var = FindVarLocally(name);
-  if (var != nullptr) {
-    return var;
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
-}
-
 const Scope* Scope::FindScope(const Variable* var) const {
   std::unique_lock<std::mutex> lock(mutex_);
-  for (auto& kv : vars_) {
-    if (kv.second.get() == var) {
-      return this;
-    }
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+  return FindScopeInternal(var);
 }
+
 void Scope::DropKids() {
   std::unique_lock<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
@@ -93,6 +73,7 @@ void Scope::DropKids() {
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
+  std::unique_lock<std::mutex> lock(mutex_);
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -129,6 +110,38 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
   std::unique_lock<std::mutex> lock(mutex_);
+  RenameInternal(origin_name, new_name);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
+  RenameInternal(origin_name, new_name);
+  return new_name;
+}
+
+Variable* Scope::VarInternal(const std::string& name) {
+  auto* v = FindVarLocally(name);
+  if (v != nullptr) return v;
+
+  v = new Variable();
+  vars_[name].reset(v);
+  VLOG(3) << "Create variable " << name;
+  v->name_ = &(vars_.find(name)->first);
+  return v;
+}
+
+const Scope* Scope::FindScopeInternal(const Variable* var) const {
+  for (auto& kv : vars_) {
+    if (kv.second.get() == var) {
+      return this;
+    }
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}
+
+void Scope::RenameInternal(const std::string& origin_name,
+                           const std::string& new_name) const {
   auto origin_it = vars_.find(origin_name);
   PADDLE_ENFORCE(origin_it != vars_.end(),
                  "Cannot find original variable with name %s", origin_name);
@@ -139,10 +152,12 @@ void Scope::Rename(const std::string& origin_name,
   vars_.erase(origin_it);
 }
 
-std::string Scope::Rename(const std::string& origin_name) const {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
-  Rename(origin_name, var_name);
-  return var_name;
+Variable* Scope::FindVarInternal(const std::string& name) const {
+  auto var = FindVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }
 
 Variable* Scope::FindVarLocally(const std::string& name) const {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 98d103d867..34687df3ab 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -85,12 +85,20 @@ class Scope {
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
+  // Called by Var.
+  Variable* VarInternal(const std::string& name);
+
+  // Called by FindScope.
+  const Scope* FindScopeInternal(const Variable* var) const;
+
+  // Called by Rename.
+  void RenameInternal(const std::string& origin_name,
+                      const std::string& new_name) const;
+
   // Called by FindVar recursively.
-  // Caller doesn't own the returned Variable.
   Variable* FindVarInternal(const std::string& name) const;
 
   // Called by FindVarInternal and Var.
-  // Caller doesn't own the returned Variable.
   Variable* FindVarLocally(const std::string& name) const;
 
   mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;

From 2955ff58871648a2cb151391ee82fc5ea570b8e6 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Mon, 11 Jun 2018 15:21:22 +0800
Subject: [PATCH 04/69] Polish documentation

* row_conv
* uniform_random
* layer_norm
* create_parameter
* hard_shrink
* ssd_loss
---
 paddle/fluid/operators/activation_op.cc     | 13 ++---
 paddle/fluid/operators/layer_norm_op.cc     | 33 +++++------
 paddle/fluid/operators/row_conv_op.cc       | 20 ++++++-
 paddle/fluid/operators/uniform_random_op.cc | 20 +++----
 python/paddle/fluid/layers/detection.py     | 63 ++++++++++++---------
 python/paddle/fluid/layers/nn.py            | 63 ++++++---------------
 python/paddle/fluid/layers/tensor.py        | 15 ++++-
 7 files changed, 115 insertions(+), 112 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 96e4c0e04c..7a567a83f9 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -276,13 +276,12 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 HardShrink Activation Operator.
 
-$$
-out = \begin{cases} 
-    x, \text{if } x > \lambda \\
-    x, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
+    out = \begin{cases}
+            x, \text{if } x > \lambda \\
+            x, \text{if } x < -\lambda \\
+            0,  \text{otherwise}
+          \end{cases}
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index ab097d31e9..14ce1da2e9 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel {
 class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("X", "The input tensor.");
     AddInput("Scale",
-             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "(optional) Scale is a 1-dimensional tensor of size "
              "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
              "It is applied to the output.")
         .AsDispensable();
     AddInput("Bias",
-             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "(optional) Bias is a 1-dimensional tensor of size "
              "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
              "It is applied to the output.")
         .AsDispensable();
-    AddOutput("Y", "(LoDTensor) Result after normalization.");
-    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
-        .AsIntermediate();
-    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
+    AddOutput("Variance", "Variance of the current mini batch.")
         .AsIntermediate();
 
     AddAttr<float>("epsilon",
-                   "(float, default 1e-5) Constant for "
-                   "numerical stability")
+                   "Constant for numerical stability [default 1e-5].")
         .SetDefault(1e-5)
         .AddCustomChecker([](const float &epsilon) {
           PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                          "'epsilon' should be between 0.0 and 0.001.");
         });
     AddAttr<int>("begin_norm_axis",
-                 "(int default:1), the "
-                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
                  "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H].")
+                 "matrix [N,H]. [default 1].")
         .SetDefault(1)
         .AddCustomChecker([](const int &begin_norm_axis) {
           PADDLE_ENFORCE_GT(begin_norm_axis, 0,
@@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
         });
 
     AddComment(R"DOC(
-Layer Normalization.
-Layer Norm has been implemented as discussed in the paper:
-https://arxiv.org/abs/1607.06450
-...
+Assume feature vectors exist on dimensions
+:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+along these dimensions for each feature vector :math:`a` with size
+:math:`H`, then normalize each feature vector using the corresponding
+statistics. After that, apply learnable gain and bias on the normalized
+tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 20f140f962..f4b540f1cb 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -78,18 +78,18 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(LoDTensor), the input(X) is a LodTensor, which supports "
+             "the input(X) is a LodTensor, which supports "
              "variable time-length input sequences. The underlying tensor "
              "in this LoDTensor is a matrix with shape (T x N), where T "
              "is the total time steps in this mini-batch and N is the input "
              "data dimension.");
     AddInput("Filter",
-             "(Tensor), the input(Filter) is a learnable parameter. It "
+             "the input(Filter) is a learnable parameter. It "
              "is a 2-D tensor with shape (future_context x N), where, "
              "future_context is the future context length and N is the data "
              "dimension.");
     AddOutput("Out",
-              "(LoDTensor), the output(Out) is a LodTensor, which supports "
+              "the output(Out) is a LodTensor, which supports "
               "variable time-length input sequences. The underlying tensor "
               "in this LodTensor is a matrix with shape T x N, i.e., the "
               "same shape as X.");
@@ -117,6 +117,20 @@ $$
 out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :}
 $$
 
+In the above equation:
+
+* $Out_{i}$: The i-th row of output variable with shape [1, D].
+
+* $\\tau$: Future context size.
+
+* $X_{j}$: The j-th row of input variable with shape [1, D].
+
+* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+
+More details about row_conv please refer to
+the design document
+https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
+
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 137ea91cae..65525526c9 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -86,32 +86,26 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddOutput("Out", "(Tensor) The output tensor of uniform random op");
+    AddOutput("Out", "The output tensor of uniform random op");
     AddComment(R"DOC(
 Uniform random operator.
 
 This operator initializes a tensor with random values sampled from a
-uniform distribution.
+uniform distribution. The random result is in set [min, max].
 
 )DOC");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) The shape of the output tensor");
-    AddAttr<float>("min",
-                   "(float, default -1.0) "
-                   "Minimum value of uniform random")
+    AddAttr<std::vector<int>>("shape", "The shape of the output tensor");
+    AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
         .SetDefault(-1.0f);
-    AddAttr<float>("max",
-                   "(float, default 1.0) "
-                   "Maximun value of uniform random")
+    AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
-                 "(int, default 0) "
                  "Random seed used for generating samples. "
                  "0 means use a seed generated by the system."
                  "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
+                 "generate the same random numbers every time. [default 0].")
         .SetDefault(0);
-    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
+    AddAttr<int>("dtype", "Output tensor data type. [default 5(FP32)].")
         .SetDefault(framework::proto::VarType::FP32);
   }
 };
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3a83db12fd..1e8dfbe521 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -373,22 +373,55 @@ def ssd_loss(location,
     confidence loss (or classification loss) by performing the following steps:
 
     1. Find matched boundding box by bipartite matching algorithm.
+
       1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
+
       1.2 Compute matched boundding box by bipartite matching algorithm.
+
     2. Compute confidence for mining hard examples
+
       2.1. Get the target label based on matched indices.
+
       2.2. Compute confidence loss.
+
     3. Apply hard example mining to get the negative example indices and update
        the matched indices.
+
     4. Assign classification and regression targets
+
       4.1. Encoded bbox according to the prior boxes.
+
       4.2. Assign regression targets.
+
       4.3. Assign classification targets.
+
     5. Compute the overall objective loss.
+
       5.1 Compute confidence loss.
+
       5.1 Compute localization loss.
+
       5.3 Compute the overall weighted loss.
 
+    >>> import paddle.fluid.layers as layers
+    >>> pb = layers.data(
+    >>>            name='prior_box',
+    >>>            shape=[10, 4],
+    >>>            append_batch_size=False,
+    >>>            dtype='float32')
+    >>> pbv = layers.data(
+    >>>            name='prior_box_var',
+    >>>            shape=[10, 4],
+    >>>            append_batch_size=False,
+    >>>            dtype='float32')
+    >>> loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
+    >>> scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
+    >>> gt_box = layers.data(
+    >>>         name='gt_box', shape=[4], lod_level=1, dtype='float32')
+    >>> gt_label = layers.data(
+    >>>         name='gt_label', shape=[1], lod_level=1, dtype='float32')
+    >>>     loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+
     Args:
         location (Variable): The location predictions are a 3D Tensor with
             shape [N, Np, 4], N is the batch size, Np is total number of
@@ -426,34 +459,12 @@ def ssd_loss(location,
             mining_type is 'hard_example'.
 
     Returns:
-        Variable: The weighted sum of the localization loss and confidence loss,
-            with shape [N * Np, 1], N and Np are the same as they are
-            in `location`.
+        The weighted sum of the localization loss and confidence loss, with \
+        shape [N * Np, 1], N and Np are the same as they are in `location`.
 
     Raises:
-        ValueError: If mining_type is 'hard_example', now only support
-            mining type of `max_negative`.
-
-    Examples:
-        .. code-block:: python
-
-            pb = layers.data(
-                name='prior_box',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            pbv = layers.data(
-                name='prior_box_var',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
-            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
-            gt_box = layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
-            gt_label = layers.data(
-                name='gt_label', shape=[1], lod_level=1, dtype='float32')
-            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+        ValueError: If mining_type is 'hard_example', now only support mining \
+        type of `max_negative`.
     """
 
     helper = LayerHelper('ssd_loss', **locals())
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index b9ea74fc81..ba13b344a1 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1624,6 +1624,7 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
+@templatedoc()
 def layer_norm(input,
                scale=True,
                shift=True,
@@ -1634,20 +1635,11 @@ def layer_norm(input,
                act=None,
                name=None):
     """
-    **Layer Normalization**
-
-    Assume feature vectors exist on dimensions
-    :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
-    along these dimensions for each feature vector :math:`a` with size
-    :math:`H`, then normalize each feature vector using the corresponding
-    statistics. After that, apply learnable gain and bias on the normalized
-    tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
-
-    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+    ${comment}
 
     The formula is as follows:
 
-    .. math::
+    ..  math::
 
         \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
 
@@ -1655,6 +1647,11 @@ def layer_norm(input,
 
         h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
 
+    >>> import paddle.fluid as fluid
+    >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
+    >>>                          dtype='float32')
+    >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+
     Args:
         input(Variable): The input tensor variable.
         scale(bool): Whether to learn the adaptive gain :math:`g` after
@@ -1672,14 +1669,7 @@ def layer_norm(input,
         act(str): Activation to be applied to the output of layer normalizaiton.
 
     Returns:
-        Variable: A tensor variable with the same shape as the input.
-
-    Examples:
-        .. code-block:: python
-
-            data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-            x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+        ${y_comment}
     """
     helper = LayerHelper('layer_norm', **locals())
     dtype = helper.input_dtype()
@@ -3184,29 +3174,19 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
     return out
 
 
+@templatedoc()
 def row_conv(input, future_context_size, param_attr=None, act=None):
-    """Row Conv Operator. This layer will apply lookahead convolution to
-    **input**. The input variable should be a 2D LoDTensor with shape [T, D].
-    Parameters with shape [future_context_size + 1, D] will be created. The math
-    equation of row convolution is as follows:
-
-    .. math::
-        Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j}
-
-    In the above equation:
+    """
+    ${comment}
 
-    * :math:`Out_{i}`: The i-th row of output variable with shape [1, D].
-    * :math:`\\tau`: Future context size.
-    * :math:`X_{j}`: The j-th row of input variable with shape [1, D].
-    * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D].
+    >>> import paddle.fluid as fluid
+    >>> x = fluid.layers.data(name='x', shape=[16],
+    >>>                        dtype='float32', lod_level=1)
+    >>> out = fluid.layers.row_conv(input=x, future_context_size=2)
 
-    More details about row_conv please refer to the paper \
-    (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and
-    the design document \
-    (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645).
 
     Args:
-        input (Variable): Input variable, a 2D LoDTensor with shape [T, D].
+        input (${x_type}): ${x_comment}.
         future_context_size (int): Future context size. Please note, the shape
             of convolution kernel is [future_context_size + 1, D].
         param_attr (ParamAttr): Attributes of parameters, including
@@ -3214,14 +3194,7 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
         act (str): Non-linear activation to be applied to output variable.
 
     Returns:
-        Variable: The output tensor with same shape as input tensor.
-
-    Examples:
-        .. code-block:: python
-
-            x = fluid.layers.data(name='x', shape=[16],
-                            dtype='float32', lod_level=1)
-            out = fluid.layers.row_conv(input=x, future_context_size=2)
+        ${out_comment}.
     """
     helper = LayerHelper('row_conv', **locals())
     dtype = helper.input_dtype()
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 66db6fe13f..dbffcae86d 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -49,7 +49,18 @@ def create_parameter(shape,
                      is_bias=False,
                      default_initializer=None):
     """
-    Create a parameter
+    Create a parameter. The parameter is a learnable variable, which can have
+    gradient, and can be optimized.
+
+    NOTE: this is a very low-level API. This API is useful when you create
+    operator by your self. instead of using layers.
+
+    >>> import paddle.fluid as fluid
+    >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
+    >>> data = fluid.layers.data(name="img", shape=[64, 784],
+    >>>           append_batch_size=False)
+    >>> hidden = fluid.layers.matmul(x=data, y=W)
+
     Args:
         shape(list[int]): shape of the parameter
         dtype(string): element type of the parameter
@@ -61,7 +72,7 @@ def create_parameter(shape,
         default_initializer(Initializer): initializer for the parameter
 
     Returns:
-        Parameter: the created parameter
+        the created parameter
     """
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:

From 6a494380e826053609dbae47241b81a774f1ed30 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 12 Jun 2018 16:12:48 +0800
Subject: [PATCH 05/69] remove mkldnn flag from gtest strdup

---
 paddle/testing/paddle_gtest_main.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 507479c862..586ec48477 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -30,7 +30,7 @@ int main(int argc, char** argv) {
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();

From 6a32f19865ac7d1a7b439b609b757f6aa08baceb Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 12 Jun 2018 16:23:33 +0800
Subject: [PATCH 06/69] fix unknown use_mkldnn

---
 paddle/fluid/inference/tests/book/test_inference_nlp.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 9dcd79c3bb..4dc576cede 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -29,6 +29,7 @@ DEFINE_string(data_file, "", "File of input index data.");
 DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
+DECLARE_bool(use_mkldnn);
 
 inline double GetCurrentMs() {
   struct timeval time;

From 6602db5b3e43ef7b5b21cce0d62ef580815206fc Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 12 Jun 2018 17:09:16 +0800
Subject: [PATCH 07/69] throw warning if try to use mkldnn while not compiled

---
 paddle/fluid/framework/executor.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 4a6f53cba1..571019be36 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -402,6 +402,9 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
       }
     }
   }
+#else
+  LOG(WARNING)
+      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }
 

From ff55d4c5937af9263d3dab6bfd9ee0d9b460a15b Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 12 Jun 2018 17:54:10 +0800
Subject: [PATCH 08/69] Polish documents

* less_than
* cumsum
* multiplex
* open_recordio_file
---
 paddle/fluid/operators/compare_op.cc          | 23 +++++-----
 paddle/fluid/operators/cumsum_op.cc           | 10 ++---
 paddle/fluid/operators/multiplex_op.cc        | 42 +++++++++++++------
 .../reader/create_recordio_file_reader_op.cc  | 10 +++--
 .../operators/reader/reader_op_registry.cc    |  2 +-
 python/paddle/fluid/layers/control_flow.py    | 29 +++++++------
 python/paddle/fluid/layers/io.py              | 31 ++++++--------
 python/paddle/fluid/layers/nn.py              | 41 +++++-------------
 8 files changed, 93 insertions(+), 95 deletions(-)

diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index 3a4819f3de..11e91c5ec8 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -23,25 +23,22 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     OpComment comment;
-    AddInput("X",
-             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
-                             comment.type));
-    AddInput("Y", string::Sprintf(
-                      "(LoDTensor) the right hand operand of %s operator",
-                      comment.type));
+    AddInput("X", string::Sprintf("the left hand operand of %s operator",
+                                  comment.type));
+    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
+                                  comment.type));
     AddAttr<bool>("force_cpu",
-                  "(bool, default false) Force fill output variable to cpu "
+                  "Force fill output variable to cpu "
                   "memory. Otherwise, fill output variable to the running "
-                  "device")
-        .SetDefault(false);
-    AddOutput("Out", string::Sprintf(
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
-                         comment.equation));
+                  "device [default true].")
+        .SetDefault(true);
+    AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
+                                     comment.equation));
     AddComment(string::Sprintf(R"DOC(%s Operator
 
 It operates element-wise on X and Y, and returns the Out. Each of them is a
 N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
-calculated by %s
+calculated by $%s$
 )DOC",
                                comment.type, comment.equation));
     AddAttr<int>("axis",
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 92bb835e8f..2caa8bf2d5 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -33,16 +33,16 @@ class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Cumsum operator");
     AddOutput("Out", "Output of Cumsum operator");
     AddAttr<int>("axis",
-                 "(int, default -1). The dimenstion to accumulate along. "
-                 "-1 means the last dimenstion")
+                 "The dimenstion to accumulate along. -1 means the last "
+                 "dimenstion [default -1].")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
     AddAttr<bool>("exclusive",
-                  "bool, default false). Whether to perform exclusive cumsum")
+                  "Whether to perform exclusive cumsum. [default false].")
         .SetDefault(false);
     AddAttr<bool>("reverse",
-                  "bool, default false). If true, the cumsum is performed in "
-                  "the reversed direction")
+                  "If true, the cumsum is performed in the reversed direction. "
+                  "[default false].")
         .SetDefault(false);
     AddComment(R"DOC(
 The cumulative sum of the elements along a given axis.
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index a4363fd25d..9db2df2a4c 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -62,26 +62,44 @@ class MultiplexOp : public framework::OperatorWithKernel {
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Ids", "The index tensor of multiplex operator.");
-    AddInput("X", "The candidate tensors of multiplex operator.")
+    AddInput("Ids",
+             "Tensor<int32>, index variable which is a 2-D tensor with shape "
+             "[M, 1] where M is the batch size.");
+    AddInput("X",
+             "A list of variables to gather from. All variables have the same "
+             "shape and the rank is at least 2.")
         .AsDuplicable();
     AddOutput("Out", "The output tensor of multiplex operator.");
     AddComment(R"DOC(
-Multiplex Operator.
-
-Multiplex multiple tensors according to the index provided by the index tensor.
-
-Ids: the index tensor.
-X[0 : N - 1]: the candidate tensors for output (N >= 2).
-For each index i from 0 to batchSize - 1, the output is the i-th row of the
+Referring to the given index variable, this layer selects rows from the
+input variables to construct a multiplex variable. Assuming that there are
+:math:`m` input variables and :math:`I_i` represents the i-th input
+variable and :math:`i` is in [0, :math:`m`). All input variables are
+tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
+Please note that rank of the input tensor should be at least 2. Each input
+variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
+where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
+* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
+variable. The given index variable should be a 2-D tensor with shape
+[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
+Then the output variable will be a tensor with shape [:math:`d_0`,
+:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
+matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
+row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+
+* Ids: the index tensor.
+
+* X[0 : N - 1]: the candidate tensors for output (N >= 2).
+
+* For each index i from 0 to batchSize - 1, the output is the i-th row of the
 the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-$$y[i] = x_{k}[i]$$
+$ y[i] = x_{k}[i] $
 
-where `y` is the output tensor, `x_{k}` is the k-th input tensor,
-and `k = Ids[i]`.
+where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
+and $k = Ids[i]$.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index 282ec3f36b..559827f084 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -78,11 +78,15 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
 class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
  protected:
   void Apply() override {
-    AddAttr<std::string>("filename", "The filename of record io reader");
+    AddAttr<std::string>(
+        "filename",
+        "The filename of record file. This file will given to reader.");
     AddComment(R"DOC(
-      CreateRecordIOReader Operator
+Open a recordio file and return the reader object. The returned reader object
+is thread-safe.
 
-      Create a reader from a record io file
+NOTE: This is a very low-level API. It is used for debugging data file or
+training. Please use `open_files` instead of this API for production usage.
     )DOC");
   }
 };
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 612e1f5eca..e11256a49f 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -54,7 +54,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
 }
 
 void FileReaderMakerBase::Make() {
-  AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable();
+  AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
   AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
   AddAttr<std::vector<int>>(
       "ranks",
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 80e8ff484a..6c707a35d3 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -909,37 +909,40 @@ def create_array(dtype):
         dtype=dtype)
 
 
-def less_than(x, y, force_cpu=True, cond=None, **ignored):
+@templatedoc()
+def less_than(x, y, force_cpu=None, cond=None, **ignored):
     """
-    **Less than**
+    ${comment}
 
-    This layer returns the truth value of :math:`x < y` elementwise.
+    >>> import paddle.fluid as fluid
+    >>> less = fluid.layers.less_than(x=label, y=limit)
 
     Args:
-        x(Variable): First operand of *less_than*
-        y(Variable): Second operand of *less_than*
-        force_cpu(Bool|True): The output data will be on CPU if set true.
+        x(${x_type}): ${x_comment}.
+        y(${y_type}): ${y_comment}.
+        force_cpu(${force_cpu_type}): ${force_cpu_comment}.
         cond(Variable|None): Optional output variable to store the result of *less_than*
 
     Returns:
-        Variable: The tensor variable storing the output of *less_than*.
-
-    Examples:
-        .. code-block:: python
-
-          less = fluid.layers.less_than(x=label, y=limit)
+        ${out_comment}.
     """
     helper = LayerHelper("less_than", **locals())
     if cond is None:
         cond = helper.create_tmp_variable(dtype='bool')
         cond.stop_gradient = True
 
+    attrs = dict()
+    if force_cpu is not None:
+        attrs['force_cpu'] = force_cpu
+    elif force_init_on_cpu():
+        attrs['force_cpu'] = force_init_on_cpu()
+
     helper.append_op(
         type='less_than',
         inputs={'X': [x],
                 'Y': [y]},
         outputs={'Out': [cond]},
-        attrs={'force_cpu': force_cpu or force_init_on_cpu()})
+        attrs=attrs)
     return cond
 
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 9de88e2c32..13a3d5441a 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -292,6 +292,7 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
+@templatedoc(op_type='create_recordio_file_reader')
 def open_recordio_file(filename,
                        shapes,
                        lod_levels,
@@ -299,34 +300,28 @@ def open_recordio_file(filename,
                        pass_num=1,
                        for_parallel=True):
     """
-    Open a RecordIO file
+    ${comment}
 
-    This layer takes a RecordIO file to read from and returns a Reader Variable.
-    Via the Reader Variable, we can get data from the given RecordIO file.
+    >>> import paddle.fluid as fluid
+    >>> reader = fluid.layers.io.open_recordio_file(
+    >>>                               filename='./data.recordio',
+    >>>                               shapes=[(3,224,224), (1)],
+    >>>                               lod_levels=[0, 0],
+    >>>                               dtypes=['float32', 'int64'])
+    >>> # Via the reader, we can use 'read_file' layer to get data:
+    >>> image, label = fluid.layers.io.read_file(reader)
 
     Args:
-       filename(str): The RecordIO file's name.
+       filename(${filename_type}): ${filename_comment}.
        shapes(list): List of tuples which declaring data shapes.
-       lod_levels(list): List of ints which declaring data lod_level.
+       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
        dtypes(list): List of strs which declaring data type.
        pass_num(int): Number of passes to run.
        for_parallel(Bool): Set it as True if you are going to run
             subsequent operators in parallel.
 
     Returns:
-       Variable: A Reader Variable via which we can get RecordIO file data.
-
-    Examples:
-       .. code-block:: python
-
-         reader = fluid.layers.io.open_recordio_file(
-                                          filename='./data.recordio',
-                                          shapes=[(3,224,224), (1)],
-                                          lod_levels=[0, 0],
-                                          dtypes=['float32', 'int64'])
-
-         # Via the reader, we can use 'read_file' layer to get data:
-         image, label = fluid.layers.io.read_file(reader)
+       ${out_comment}.
     """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ba13b344a1..1c5322288a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3210,42 +3210,23 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     return helper.append_activation(out)
 
 
+@templatedoc()
 def multiplex(inputs, index):
     """
-    **Multiplex Layer**
-
-    Referring to the given index variable, this layer selects rows from the
-    input variables to construct a multiplex variable. Assuming that there are
-    :math:`m` input variables and :math:`I_i` represents the i-th input
-    variable and :math:`i` is in [0, :math:`m`). All input variables are
-    tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
-    Please note that rank of the input tensor should be at least 2. Each input
-    variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
-    where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
-    * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
-    variable. The given index variable should be a 2-D tensor with shape
-    [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
-    Then the output variable will be a tensor with shape [:math:`d_0`,
-    :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
-    matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
-    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
+    >>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
+    >>> index = fluid.layers.data(name='index', shape=[1], dtype='int32')
+    >>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
 
     Args:
-       inputs (list): A list of variables to gather from. All variables have the
-                same shape and the rank is at least 2.
-       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
-                with shape [M, 1] where M is the batch size.
+       inputs (list): ${x_comment}.
+       index (${ids_type}): ${ids_comment}.
 
     Returns:
-        Variable: Multiplex variable gathered from input variables.
-
-    Examples:
-        .. code-block:: python
-
-            x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[1], dtype='int32')
-            out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
+        ${out_comment}.
     """
     helper = LayerHelper('multiplex', **locals())
 

From 5d0bf8bc8f4daa3b86e478c9870297f101a9788b Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 12 Jun 2018 21:26:07 +0800
Subject: [PATCH 09/69] Add API docs.

---
 paddle/fluid/operators/get_places_op.cc    |  2 +-
 python/paddle/fluid/layers/control_flow.py | 26 ++++++++++++++++++++++
 python/paddle/fluid/layers/io.py           | 19 ++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index eafc364a15..db6ff78256 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -85,7 +85,7 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         .InEnum({"CUDA", "CPU", "AUTO"})
         .SetDefault("AUTO");
     AddComment(R"DOC(
-Returns a list of places based on flags. The list will be used for parallel
+Returns a list of places based on arguments. The list will be used for parallel
 execution.
 )DOC");
   }
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 80e8ff484a..7e541efcfc 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1209,6 +1209,32 @@ class IfElseBlockGuard(object):
 
 
 class IfElse(object):
+    """
+    if-else control flow.
+
+    Args:
+        cond (Variable): condition used to compare.
+        name (str, default None): The name of this layer.
+
+    Examples:
+          .. code-block:: python
+            limit = layers.fill_constant_batch_size_like(
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
+            ie = layers.IfElse(cond)
+            with ie.true_block():
+                true_image = ie.input(image)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            with ie.false_block():
+                false_image = ie.input(image)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+            prob = ie()
+    """
     OUT_IF_ELSE_BLOCKS = 0
     IN_IF_ELSE_TRUE_BLOCKS = 1
     IN_IF_ELSE_FALSE_BLOCKS = 2
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 9de88e2c32..df264e4f26 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -587,6 +587,25 @@ def read_file(file_obj):
 
 
 class Preprocessor(object):
+    """
+    A block for data pre-processing in reader.
+
+    Args:
+        reader (Variable): A reader variable.
+        name (str, default None): The name of the reader.
+
+    Examples:
+          .. code-block:: python
+            preprocessor = fluid.layers.io.Preprocessor(reader=reader)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+
+    """
     BEFORE_SUB_BLOCK = 0
     IN_SUB_BLOCK = 1
     AFTER_SUB_BLOCK = 2

From 9185579c45281641618e95996b3451a5a81ea215 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 12 Jun 2018 21:58:28 +0800
Subject: [PATCH 10/69] follow comments

---
 python/paddle/fluid/layers/control_flow.py | 1 +
 python/paddle/fluid/layers/io.py           | 1 +
 2 files changed, 2 insertions(+)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 7e541efcfc..2279197717 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1218,6 +1218,7 @@ class IfElse(object):
 
     Examples:
           .. code-block:: python
+
             limit = layers.fill_constant_batch_size_like(
                 input=label, dtype='int64', shape=[1], value=5.0)
             cond = layers.less_than(x=label, y=limit)
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index df264e4f26..f3aeb6cd75 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -596,6 +596,7 @@ class Preprocessor(object):
 
     Examples:
           .. code-block:: python
+
             preprocessor = fluid.layers.io.Preprocessor(reader=reader)
             with preprocessor.block():
                 img, lbl = preprocessor.inputs()

From 26ed4b13573678507d09e466e85fe2dc8a074105 Mon Sep 17 00:00:00 2001
From: weixing02 <wx_crome@163.com>
Date: Wed, 13 Jun 2018 11:12:02 +0800
Subject: [PATCH 11/69] fix deadlink

---
 doc/v2/dev/contribute_to_paddle_cn.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md
index add06e42f1..3244eedf91 100644
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
 
-关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
 
 ## 提交（commit）
 

From b9843abb613860da99cd6dc5bda502f6d595d165 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Wed, 13 Jun 2018 13:20:09 +0800
Subject: [PATCH 12/69] Polish comsum, DynamicRNN

---
 paddle/fluid/operators/cumsum_op.cc        |   4 +-
 python/paddle/fluid/layers/control_flow.py | 142 +++++++++++++++++++++
 2 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 2caa8bf2d5..5302b822d6 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -30,8 +30,8 @@ class CumOp : public framework::OperatorWithKernel {
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "Input of Cumsum operator");
-    AddOutput("Out", "Output of Cumsum operator");
+    AddInput("X", "Input of cumsum operator");
+    AddOutput("Out", "Output of cumsum operator");
     AddAttr<int>("axis",
                  "The dimenstion to accumulate along. -1 means the last "
                  "dimenstion [default -1].")
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 6c707a35d3..8d28aeb2ed 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -20,6 +20,7 @@ from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
 from ops import logical_and, logical_not, logical_or
+import numpy
 
 __all__ = [
     'split_lod_tensor',
@@ -1314,6 +1315,39 @@ class IfElse(object):
 
 
 class DynamicRNN(object):
+    """
+    Dynamic RNN.
+
+    This RNN can process a batch of sequence data. The length of each sample
+    sequence can be different. This API automatically process them in batch.
+
+    The input lod must be set. Please reference `lod_tensor`
+
+    >>> import paddle.fluid as fluid
+    >>> data = fluid.layers.data(name='sentence', dtype='int64', lod_level=1)
+    >>> embedding = fluid.layers.embedding(input=data, size=[65535, 32],
+    >>>                                    is_sparse=True)
+    >>>
+    >>> drnn = fluid.layers.DynamicRNN()
+    >>> with drnn.block():
+    >>>     word = drnn.step_input(embedding)
+    >>>     prev = drnn.memory(shape=[200])
+    >>>     hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
+    >>>     drnn.update_memory(prev, hidden)  # set prev to hidden
+    >>>     drnn.output(hidden)
+    >>>
+    >>> # last is the last time step of rnn. It is the encoding result.
+    >>> last = fluid.layers.sequence_last_step(drnn())
+
+    The dynamic RNN will unfold sequence into timesteps. Users need to define
+    how to process each time step during the :code:`with` block.
+
+    The `memory` is used staging data cross time step. The initial value of
+    memory can be zero or another variable.
+
+    The dynamic RNN can mark multiple variables as its output. Use `drnn()` to
+    get the output sequence.
+    """
     BEFORE_RNN = 0
     IN_RNN = 1
     AFTER_RNN = 2
@@ -1336,6 +1370,15 @@ class DynamicRNN(object):
         self.mem_link = []
 
     def step_input(self, x):
+        """
+        Mark a sequence as a dynamic RNN input.
+        Args:
+            x(Variable): The input sequence.
+
+        Returns:
+            The current timestep in the input sequence.
+
+        """
         self._assert_in_rnn_block_("step_input")
         if not isinstance(x, Variable):
             raise TypeError(
@@ -1379,6 +1422,15 @@ class DynamicRNN(object):
         return array_read(array=input_array, i=self.step_idx)
 
     def static_input(self, x):
+        """
+        Mark a variable as a RNN input. The input will not be scattered into
+        time steps.
+        Args:
+            x(Variable): The input variable.
+
+        Returns:
+            The input variable that can access in RNN.
+        """
         self._assert_in_rnn_block_("static_input")
         if not isinstance(x, Variable):
             raise TypeError(
@@ -1400,6 +1452,10 @@ class DynamicRNN(object):
 
     @contextlib.contextmanager
     def block(self):
+        """
+        The block for user to define operators in RNN. See the class docstring
+        for more details.
+        """
         if self.status != DynamicRNN.BEFORE_RNN:
             raise ValueError("rnn.block() can only be invoke once")
         self.step_idx = fill_constant(
@@ -1426,6 +1482,9 @@ class DynamicRNN(object):
                     x=each_array, table=self.lod_rank_table))
 
     def __call__(self, *args, **kwargs):
+        """
+        Get the output of RNN. This API should only be invoked after RNN.block()
+        """
         if self.status != DynamicRNN.AFTER_RNN:
             raise ValueError(("Output of the dynamic RNN can only be visited "
                               "outside the rnn block."))
@@ -1440,6 +1499,70 @@ class DynamicRNN(object):
                value=0.0,
                need_reorder=False,
                dtype='float32'):
+        """
+        Create a memory variable.
+
+        If the :code:`init` is not None, :code:`memory` will be initialized by
+        this variable. The :code:`need_reorder` is used to reorder the memory as
+        the input variable. It should be set to true when the initialized memory
+        depends on the input sample.
+
+        For example,
+
+        >>> import paddle.fluid as fluid
+        >>> sentence = fluid.layers.data(
+        >>>                 name='sentence', dtype='float32', shape=[32])
+        >>> boot_memory = fluid.layers.data(
+        >>>                 name='boot', dtype='float32', shape=[10])
+        >>>
+        >>> drnn = fluid.layers.DynamicRNN()
+        >>> with drnn.block():
+        >>>     word = drnn.step_input(sentence)
+        >>>     memory = drnn.memory(init=boot_memory, need_reorder=True)
+        >>>     hidden = fluid.layers.fc(
+        >>>                 input=[word, memory], size=10, act='tanh')
+        >>>     drnn.update_memory(ex_mem=memory, new_mem=hidden)
+        >>>     drnn.output(hidden)
+        >>> rnn_output = drnn()
+
+
+        Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the
+        :code:`memory` will be initialized by this :code:`value`.
+
+        For example,
+
+        >>> import paddle.fluid as fluid
+        >>> sentence = fluid.layers.data(
+        >>>                 name='sentence', dtype='float32', shape=[32])
+        >>>
+        >>> drnn = fluid.layers.DynamicRNN()
+        >>> with drnn.block():
+        >>>     word = drnn.step_input(sentence)
+        >>>     memory = drnn.memory(shape=[10], dtype='float32', value=0)
+        >>>     hidden = fluid.layers.fc(
+        >>>             input=[word, memory], size=10, act='tanh')
+        >>>     drnn.update_memory(ex_mem=memory, new_mem=hidden)
+        >>>     drnn.output(hidden)
+        >>> rnn_output = drnn()
+
+
+        Args:
+            init(Variable|None): The initialized variable.
+
+            shape(list|tuple): The memory shape. NOTE the shape does not contain
+            batch_size.
+
+            value(float): the initalized value.
+
+            need_reorder(bool): True if the initialized memory depends on the
+            input sample.
+
+            dtype(str|numpy.dtype): The data type of the initialized memory.
+
+        Returns:
+            the memory variable.
+
+        """
         self._assert_in_rnn_block_('memory')
         if init is not None:
             if not isinstance(init, Variable):
@@ -1507,6 +1630,16 @@ class DynamicRNN(object):
             return self.memory(init=init)
 
     def update_memory(self, ex_mem, new_mem):
+        """
+        Update the memory from ex_mem to new_mem. NOTE that the shape and data
+        type of :code:`ex_mem` and :code:`new_mem` must be same.
+        Args:
+            ex_mem(Variable): the memory variable.
+            new_mem(Variable): the plain variable generated in RNN block.
+
+        Returns:
+            None
+        """
         self._assert_in_rnn_block_('update_memory')
         if not isinstance(ex_mem, Variable):
             raise TypeError("The input arg `ex_mem` of update_memory() must "
@@ -1524,6 +1657,15 @@ class DynamicRNN(object):
         self.mem_link.append((new_mem, mem_array))
 
     def output(self, *outputs):
+        """
+        mark the RNN output variables.
+
+        Args:
+            outputs: The output variables.
+
+        Returns:
+            None
+        """
         self._assert_in_rnn_block_('output')
         parent_block = self._parent_block_()
         for each in outputs:

From c6c9c657e0c4d9d2c17129511f6a3ab30a190486 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Wed, 13 Jun 2018 15:35:23 +0800
Subject: [PATCH 13/69] update doc

---
 python/paddle/fluid/layers/control_flow.py    | 36 ++++++---
 .../fluid/layers/learning_rate_scheduler.py   | 70 +++++++++++++-----
 python/paddle/fluid/layers/nn.py              | 73 +++++++++++++------
 python/paddle/fluid/layers/tensor.py          | 42 +++++++----
 4 files changed, 155 insertions(+), 66 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 80e8ff484a..f4013b61d0 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -748,16 +748,25 @@ def max_sequence_len(rank_table):
 
 
 def lod_tensor_to_array(x, table):
-    """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
+    """ 
+    Convert a LoDTensor to a LoDTensorArray.
+
+    This function split a LoDTesnor to a LoDTensorArray according to its LoD 
+    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in 
+    Paddle. The generated LoDTensorArray of this function can be further read 
+    or written by 'read_from_array()' and 'write_to_array()' operators. However, 
+    this function is generally an internal component of Paddle 'DynamicRNN'. 
+    Users should not use it directly.
 
     Args:
-        x (Variable|list): The LOD tensor to be converted to a LOD tensor array.
+        x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
         table (ParamAttr|list): The variable that stores the level of lod
                                 which is ordered by sequence length in
-                                descending order.
+                                descending order. It is generally generated 
+                                by 'layers.lod_rank_table()' API.
 
     Returns:
-        Variable: The variable of type array that has been converted from a
+        Variable: The LoDTensorArray that has been converted from the input 
                   tensor.
 
     Examples:
@@ -1047,6 +1056,13 @@ def array_length(array):
 
 
 class ConditionalBlockGuard(BlockGuard):
+    """
+    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for 
+    holding a ConditionalBlock, and helping users entering and exiting the 
+    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard 
+    is generally an internal component of IfElse, users should not use it directly.
+    """
+
     def __init__(self, block):
         if not isinstance(block, ConditionalBlock):
             raise TypeError("block should be conditional block")
@@ -1563,17 +1579,15 @@ def reorder_lod_tensor_by_rank(x, rank_table):
 
 def is_empty(x, cond=None, **ignored):
     """
-    **Is Empty**
-
-    This layer returns the truth value of whether the variable is empty.
+    Test whether an Variable is empty.
 
     Args:
-        x(Variable): Operand of *is_empty*
-        cond(Variable|None): Optional output variable to store the result
-                             of *is_empty*
+        x (Variable): The Variable to be tested.
+        cond (Variable|None): Output parameter. Returns the test result 
+                              of given 'x'.
 
     Returns:
-        Variable: The tensor variable storing the output of *is_empty*.
+        Variable: The tensor variable storing the test result of 'x'.
 
     Raises:
         TypeError: If input cond is not a variable, or cond's dtype is
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 716cc7824e..9cbb559093 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -70,21 +70,40 @@ def noam_decay(d_model, warmup_steps):
 
 
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies exponential decay to the learning rate.
+    """
+    Applies exponential decay to the learning rate. 
+
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, the learning rate will be decayed by 
+    'decay_rate' every 'decay_steps' steps.
+
+    >>> if staircase == True:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
 
-    ```python
-    decayed_learning_rate = learning_rate *
-            decay_rate ^ (global_step / decay_steps)
-    ```
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+                            Default: False
 
     Returns:
         The decayed learning rate
+
+    Examples:
+        .. code-block:: python
+
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
+
     """
     global_step = _decay_step_counter()
 
@@ -128,22 +147,39 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
 
 
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies inverse time decay to the initial learning rate.
+    """
+    Applies inverse time decay to the initial learning rate.
 
-    >>> if staircase:
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, an inverse decay function will be 
+    applied to the initial learning rate.
+
+    >>> if staircase == True:
     >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
     >>> else:
     >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
 
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training.
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+                            Default: False
 
     Returns:
         The decayed learning rate
+
+    Examples:
+        .. code-block:: python
+
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.inverse_time_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
     """
     global_step = _decay_step_counter()
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index c8cbb5ef00..047c3aa2b7 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -102,14 +102,15 @@ def fc(input,
     """
     **Fully Connected Layer**
 
-    The fully connected layer can take multiple tensors as its inputs. It
-    creates a variable called weights for each input tensor, which represents
-    a fully connected weight matrix from each input unit to each output unit.
-    The fully connected layer multiplies each input tensor with its coresponding
-    weight to produce an output Tensor. If multiple input tensors are given,
-    the results of multiple multiplications will be sumed up. If bias_attr is
-    not None, a bias variable will be created and added to the output. Finally,
-    if activation is not None, it will be applied to the output as well.
+    This function creates a fully connected layer in the network. It can take 
+    multiple tensors as its inputs. It creates a variable called weights for 
+    each input tensor, which represents a fully connected weight matrix from 
+    each input unit to each output unit. The fully connected layer multiplies 
+    each input tensor with its coresponding weight to produce an output Tensor. 
+    If multiple input tensors are given, the results of multiple multiplications 
+    will be sumed up. If bias_attr is not None, a bias variable will be created 
+    and added to the output. Finally, if activation is not None, it will be applied 
+    to the output as well.
 
     This process can be formulated as follows:
 
@@ -878,7 +879,7 @@ def cos_sim(X, Y):
     Args:
         X (Variable): The input X.
         Y (Variable): The input Y.
-    
+
     Returns:
         Variable: the output of cosine(X, Y).
     """
@@ -1083,7 +1084,7 @@ def chunk_eval(input,
         chunk_scheme (str): ${chunk_scheme_comment}
         num_chunk_types (int): ${num_chunk_types_comment}
         excluded_chunk_types (list): ${excluded_chunk_types_comment}
-    
+
     Returns:
         tuple: tuple containing: (precision, recall, f1_score,
                num_infer_chunks, num_label_chunks,
@@ -1143,7 +1144,7 @@ def sequence_conv(input,
         bias_attr (ParamAttr|None): attributes for bias
         param_attr (ParamAttr|None): attributes for parameter
         act (str): the activation type
-    
+
     Returns:
         Variable: output of sequence_conv
     """
@@ -1509,6 +1510,7 @@ def sequence_last_step(input):
     return sequence_pool(input=input, pool_type="last")
 
 
+@templatedoc()
 def pool2d(input,
            pool_size=-1,
            pool_type="max",
@@ -1520,12 +1522,12 @@ def pool2d(input,
            use_mkldnn=False,
            name=None):
     """
-    This function adds the operator for pooling in 2 dimensions, using the
-    pooling configurations mentioned in input parameters.
+    ${comment}
 
     Args:
         input (Variable): ${input_comment}
-        pool_size (int): ${ksize_comment}
+        pool_size (int): The side length of pooling windows. All pooling 
+                         windows are squares with pool_size on a side.
         pool_type (str): ${pooling_type_comment}
         pool_stride (int): stride of the pooling layer.
         pool_padding (int): padding size.
@@ -1533,11 +1535,29 @@ def pool2d(input,
         use_cudnn (bool): ${use_cudnn_comment}
         ceil_mode (bool): ${ceil_mode_comment}
         use_mkldnn (bool): ${use_mkldnn_comment}
-        name (str): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-    
+        name (str|None): A name for this layer(optional). If set None, the 
+                        layer will be named automatically.
+
     Returns:
         Variable: output of pool2d layer.
+
+    Raises:
+        ValueError: If 'pool_type' is not "max" nor "avg"
+        ValueError: If 'global_pooling' is False and 'pool_size' is -1
+        ValueError: If 'use_cudnn' is not a bool value.
+
+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.pool2d(
+                            input=data, 
+                            pool_size=2, 
+                            pool_type='max', 
+                            pool_stride=1, 
+                            global_pooling=False)
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
@@ -1800,7 +1820,7 @@ def beam_search_decode(ids, scores, name=None):
         ids (Variable): ${ids_comment}
         scores (Variable): ${scores_comment}
         name (str): The name of this layer. It is optional.
-    
+
     Returns:
         tuple: a tuple of two output variable: sentence_ids, sentence_scores
     """
@@ -2063,7 +2083,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
         beam_size (int): ${beam_size_comment}
         end_id (int): ${end_id_comment}
         level (int): ${level_comment}
-    
+
     Returns:
         tuple: a tuple of beam_search output variables: selected_ids, selected_scores
     '''
@@ -2719,7 +2739,7 @@ def topk(input, k, name=None):
     This operator is used to find values and indices of the k largest entries
     for the last dimension.
 
-    If the input is a vector (rank=1), finds the k largest entries in the vector
+    If the input is a vector (1-D Tensor), finds the k largest entries in the vector
     and outputs their values and indices as vectors. Thus values[j] is the j-th
     largest entry in input, and its index is indices[j].
 
@@ -2729,9 +2749,11 @@ def topk(input, k, name=None):
     Args:
         input(Variable): The input variable which can be a vector or Tensor with
             higher rank.
-        k(int): An integer value to specify the top k largest elements.
+        k(int):  The number of top elements to look for along the last dimension 
+                 of input.
         name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
+                       will be named automatically. 
+                       Default: None
 
     Returns:
         values(Variable): The k largest elements along each last dimensional
@@ -2739,13 +2761,16 @@ def topk(input, k, name=None):
         indices(Variable): The indices of values within the last dimension of
             input.
 
+    Raises:
+        ValueError: If k < 1 or k is not less than the last dimension of input
+
     Examples:
         .. code-block:: python
 
             top5_values, top5_indices = layers.topk(input, k=5)
     """
     shape = input.shape
-    if k < 1 and k >= shape[-1]:
+    if k < 1 or k >= shape[-1]:
         raise ValueError("k must be greater than 0 and less than %d." %
                          (shape[-1]))
 
@@ -3045,7 +3070,7 @@ def nce(input,
         param_attr (ParamAttr|None): attributes for parameter
         bias_attr (ParamAttr|None): attributes for bias
         num_neg_samples (int): ${num_neg_samples_comment}
-    
+
     Returns:
         Variable: output of nce layer.
     """
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 62b01d595a..e03c8ca914 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -79,20 +79,33 @@ def create_global_var(shape,
                       force_cpu=False,
                       name=None):
     """
-    Create a global variable. such as global_step
+    Create a new variable in the global block(block 0).
+
     Args:
         shape(list[int]): shape of the variable
-        value(float): the value of the variable
-        dtype(string): element type of the parameter
-        persistable(bool): if this variable is persistable
-        force_cpu(bool): force this variable to be on CPU
+        value(float): the value of the variable. The new created 
+                      variable will be filled with it.
+        dtype(string): data type of the variable
+        persistable(bool): if this variable is persistable. 
+                           Default: False
+        force_cpu(bool): force this variable to be on CPU. 
+                         Default: False
+        name(str|None): The name of the variable. If set to None the variable 
+                        name will be generated automatically. 
+                        Default: None
 
     Returns:
         Variable: the created Variable
+
+    Examples:
+        .. code-block:: python
+
+            var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', 
+                                 persistable=True, force_cpu=True, name='new_var')
     """
     helper = LayerHelper("global_var", **locals())
     var = helper.create_global_variable(
-        dtype=dtype, shape=shape, persistable=persistable, name=name)
+        dtype=dtype, shape=shape, persistable=persistable)
     helper.set_variable_initializer(
         var, initializer=Constant(
             value=float(value), force_cpu=force_cpu))
@@ -152,10 +165,11 @@ def sums(input, out=None):
     Args:
         input (Variable|list): The input tensor that has the elements
                                that need to be summed up.
+        out (Variable|None): Output parameter. Returns the sum result.
+                             Default: None
 
     Returns:
-        Variable: The tensor type variable that has the sum of input
-                  written to it.
+        Variable: the sum of input. The same as the argument 'out'
 
     Examples:
         .. code-block::python
@@ -328,13 +342,13 @@ def argmin(x, axis=0):
         x(Variable): The input to compute the indices of
                      the min elements.
         axis(int): Axis to compute indices along.
-    
+
     Returns:
         Variable: The tensor variable storing the output
-    
+
     Examples:
         .. code-block:: python
-          
+
           out = fluid.layers.argmin(x=in, axis=0)
           out = fluid.layers.argmin(x=in, axis=-1)  
     """
@@ -359,13 +373,13 @@ def argmax(x, axis=0):
         x(Variable): The input to compute the indices of
                      the max elements.
         axis(int): Axis to compute indices along.
-    
+
     Returns:
         Variable: The tensor variable storing the output
-    
+
     Examples:
         .. code-block:: python
-          
+
           out = fluid.layers.argmax(x=in, axis=0)
           out = fluid.layers.argmax(x=in, axis=-1)  
     """

From 3ab32532d54af6185a9604883abd022d7a3bd6fc Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 13 Jun 2018 17:02:38 +0800
Subject: [PATCH 14/69] Add conv3d Python API

---
 python/paddle/fluid/layers/nn.py | 168 ++++++++++++++++++++++++++++++-
 1 file changed, 166 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index c8cbb5ef00..f6b4348d25 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1305,8 +1305,6 @@ def conv2d(input,
           conv2d = fluid.layers.conv2d(
               input=data, num_filters=2, filter_size=3, act="relu")
     """
-    if stride is None:
-        stride = [1, 1]
 
     num_channels = input.shape[1]
 
@@ -1369,6 +1367,172 @@ def conv2d(input,
     return helper.append_activation(pre_act)
 
 
+def conv3d(input,
+           num_filters,
+           filter_size,
+           stride=1,
+           padding=0,
+           dilation=1,
+           groups=None,
+           param_attr=None,
+           bias_attr=None,
+           use_cudnn=True,
+           use_mkldnn=False,
+           act=None,
+           name=None):
+    """
+    **Convlution3D Layer**
+
+    The convolution3D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input(Input) and
+    Output(Output) are in NCHW format. Where N is batch size, C is the number of
+    channels, H is the height of the feature, and W is the width of the feature.
+    The details of convolution layer, please refer UFLDL's `convolution,
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    If bias attribution and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
+                   different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
+
+        - Output:
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
+
+    Args:
+        input (Variable): The input image with [N, C, D, H, W] format.
+            num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv3d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv3d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not.
+        act (str): Activation type. Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv3d(
+              input=data, num_filters=2, filter_size=3, act="relu")
+    """
+
+    l_type = 'conv3d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]
+
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:
+            raise ValueError("num_channels must be divisible by groups.")
+        num_filter_channels = num_channels / groups
+
+    filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    padding = utils.convert_to_list(padding, 3, 'padding')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    input_shape = input.shape
+    filter_shape = [num_filters, num_filter_channels] + filter_size
+
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**3 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        default_initializer=_get_default_param_initializer())
+
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type=l_type,
+        inputs={
+            'Input': input,
+            'Filter': filter_param,
+        },
+        outputs={"Output": pre_bias},
+        attrs={
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': use_mkldnn
+        })
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=3)
+
+    return helper.append_activation(pre_act)
+
+
 def sequence_pool(input, pool_type):
     """
     This function add the operator for sequence pooling.

From 2183d01799dcaa452c63ca04d5ac194f91f01b25 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 13 Jun 2018 17:14:00 +0800
Subject: [PATCH 15/69] Add pool3d and conv3d_trans Python API

---
 python/paddle/fluid/layers/nn.py | 273 +++++++++++++++++++++++++++++--
 1 file changed, 257 insertions(+), 16 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index f6b4348d25..3a4f93929f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -39,13 +39,16 @@ __all__ = [
     'chunk_eval',
     'sequence_conv',
     'conv2d',
+    'conv3d',
     'sequence_pool',
     'sequence_softmax',
     'softmax',
     'pool2d',
+    'pool3d',
     'batch_norm',
     'beam_search_decode',
     'conv2d_transpose',
+    'conv3d_transpose',
     'sequence_expand',
     'lstm_unit',
     'reduce_sum',
@@ -1385,13 +1388,12 @@ def conv3d(input,
 
     The convolution3D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are in NCHW format. Where N is batch size, C is the number of
-    channels, H is the height of the feature, and W is the width of the feature.
-    The details of convolution layer, please refer UFLDL's `convolution,
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
-    If bias attribution and activation type are provided, bias is added to the
-    output of the convolution, and the corresponding activation function is
-    applied to the final result.
+    Output(Output) are in NCDHW format. Where N is batch size C is the number of
+    channels, D is the depth of the feature, H is the height of the feature,
+    and W is the width of the feature. Convlution3D is similar with Convlution2D
+    but adds one dimension(depth). If bias attribution and activation type are
+    provided, bias is added to the output of the convolution, and the
+    corresponding activation function is applied to the final result.
 
     For each input :math:`X`, the equation is:
 
@@ -1401,8 +1403,8 @@ def conv3d(input,
 
     In the above equation:
 
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
@@ -1433,16 +1435,16 @@ def conv3d(input,
             num_filters(int): The number of filter. It is as same as the output
             image channel.
         filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_D, filter_size_H, filter_size_W).
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
             Otherwise, the filter will be a square.
         stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_D, stride_H, stride_W). Otherwise, the
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
             stride_D = stride_H = stride_W = stride. Default: stride = 1.
         padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_D, padding_H, padding_W). Otherwise, the
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
             padding_D = padding_H = padding_W = padding. Default: padding = 0.
         dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
             dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
         groups (int): The groups number of the Conv3d Layer. According to grouped
             convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
@@ -1528,7 +1530,7 @@ def conv3d(input,
             'use_mkldnn': use_mkldnn
         })
 
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=3)
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
 
     return helper.append_activation(pre_act)
 
@@ -1720,12 +1722,84 @@ def pool2d(input,
     if not isinstance(use_cudnn, bool):
         raise ValueError("use_cudnn should be True or False")
 
-    helper = LayerHelper('pool2d', **locals())
+    l_type = 'conv2d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding,
+            "use_cudnn": use_cudnn,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": use_mkldnn
+        })
+
+    return pool_out
+
+
+def pool3d(input,
+           pool_size=-1,
+           pool_type="max",
+           pool_stride=1,
+           pool_padding=0,
+           global_pooling=False,
+           use_cudnn=True,
+           ceil_mode=False,
+           use_mkldnn=False,
+           name=None):
+    """
+    This function adds the operator for pooling in 3-dimensions, using the
+    pooling configurations mentioned in input parameters.
+
+    Args:
+        input (Variable): ${input_comment}
+        pool_size (int): ${ksize_comment}
+        pool_type (str): ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: output of pool3d layer.
+    """
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+            str(pool_type))
+
+    if global_pooling is False and pool_size == -1:
+        raise ValueError(
+            "When the global_pooling is False, pool_size must be passed "
+            "and be a valid value. Received pool_size: " + str(pool_size))
+
+    pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
+    pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding')
+    pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    l_type = "pool3d"
+    helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_tmp_variable(dtype)
 
     helper.append_op(
-        type="pool2d",
+        type=l_type,
         inputs={"X": input},
         outputs={"Out": pool_out},
         attrs={
@@ -2146,6 +2220,173 @@ def conv2d_transpose(input,
     return out
 
 
+def conv3d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=0,
+                     stride=1,
+                     dilation=1,
+                     groups=None,
+                     param_attr=None,
+                     bias_attr=None,
+                     use_cudnn=True,
+                     act=None,
+                     name=None):
+    """
+    **Convlution3D transpose layer**
+
+    The convolution3D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCDHW format. Where N is batch size, C is the number of channels,
+    D is the depth of the feature, H is the height of the feature, and W
+    is the width of the feature. Parameters(dilations, strides, paddings) are
+    two elements. These two elements represent height and width, respectively.
+    The details of convolution transpose layer, please refer to the following
+    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = W \\ast X
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast` : Convolution transpose operation.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
+                   different.
+
+    Example:
+
+        - Input:
+
+          Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+
+          Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
+
+        - Output:
+
+          Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+
+        Where
+
+        .. math::
+
+           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
+           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
+           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
+
+    Args:
+        input(Variable): The input image with [N, C, D, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain three integers, (image_D, image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv3d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer.
+            Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act(str): Activation type. Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable storing the convolution transpose result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+       .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv2d_transpose = fluid.layers.conv3d_transpose(
+              input=data, num_filters=2, filter_size=3)
+    """
+    l_type = "conv3d_transpose"
+    helper = LayerHelper(l_type, **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv3d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    padding = utils.convert_to_list(padding, 3, 'padding')
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        d_in = input.shape[2]
+        h_in = input.shape[3]
+        w_in = input.shape[4]
+
+        filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 *
+                         padding[0] - 1) / dilation[0] + 1
+        filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 *
+                         padding[1] - 1) / dilation[1] + 1
+        filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 *
+                         padding[2] - 1) / dilation[2] + 1
+        filter_size = [filter_size_d, filter_size_h, filter_size_w]
+    else:
+        filter_size = utils.convert_to_list(filter_size, 3,
+                                            'conv3d_transpose.filter_size')
+
+    groups = 1 if groups is None else groups
+    filter_shape = [input_channel, num_filters / groups] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type=l_type,
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': pre_bias},
+        attrs={
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn
+        })
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+    out = helper.append_activation(pre_act)
+    return out
+
+
 def sequence_expand(x, y, ref_level=-1, name=None):
     """Sequence Expand Layer. This layer will expand the input variable **x**
     according to specified level lod of **y**. Please note that lod level of

From c1843fd2ae84fc3f88b7eb40953dbed38a682083 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 13 Jun 2018 17:49:44 +0800
Subject: [PATCH 16/69] improve

---
 python/paddle/fluid/layers/control_flow.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 2279197717..4db085e9f5 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1219,20 +1219,21 @@ class IfElse(object):
     Examples:
           .. code-block:: python
 
-            limit = layers.fill_constant_batch_size_like(
+            limit = fluid.layers.fill_constant_batch_size_like(
                 input=label, dtype='int64', shape=[1], value=5.0)
-            cond = layers.less_than(x=label, y=limit)
-            ie = layers.IfElse(cond)
+            cond = fluid.layers.less_than(x=label, y=limit)
+            ie = fluid.layers.IfElse(cond)
             with ie.true_block():
                 true_image = ie.input(image)
-                hidden = layers.fc(input=true_image, size=100, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
+                hidden = fluid.layers.fc(input=true_image, size=100, act='tanh')
+                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
                 ie.output(prob)
 
             with ie.false_block():
                 false_image = ie.input(image)
-                hidden = layers.fc(input=false_image, size=200, act='tanh')
-                prob = layers.fc(input=hidden, size=10, act='softmax')
+                hidden = fluid.layers.fc(
+                    input=false_image, size=200, act='tanh')
+                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
                 ie.output(prob)
             prob = ie()
     """

From ce6394ed73ae5f9b6ad44c0e8dca762791001f2d Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Wed, 13 Jun 2018 17:50:27 +0800
Subject: [PATCH 17/69] Polish example

---
 paddle/fluid/operators/row_conv_op.cc       |  2 +-
 paddle/fluid/operators/uniform_random_op.cc |  2 --
 python/paddle/fluid/layers/nn.py            | 30 ++++++++++++++-------
 python/paddle/fluid/layers/ops.py           | 21 ++++++++++++++-
 python/paddle/fluid/layers/tensor.py        | 15 +++++------
 5 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index f4b540f1cb..d7286111fd 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -114,7 +114,7 @@ and a filter ($W$) of size $context \times d$,
 the output sequence is convolved as:
 
 $$
-out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :}
+out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
 $$
 
 In the above equation:
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 65525526c9..edd1baa4ac 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -88,8 +88,6 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddOutput("Out", "The output tensor of uniform random op");
     AddComment(R"DOC(
-Uniform random operator.
-
 This operator initializes a tensor with random values sampled from a
 uniform distribution. The random result is in set [min, max].
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index fe60d8b78d..f1241d9479 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1718,10 +1718,14 @@ def layer_norm(input,
 
         h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
 
-    >>> import paddle.fluid as fluid
-    >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
-    >>>                          dtype='float32')
-    >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+    * :math:`a`: the vector representation of the summed inputs to the neurons
+    in that layer.
+
+    * :math:`H`: the number of hidden units in a layers
+
+    * :math:`g`: the trainable scale parameter.
+
+    * :math:`b`: the trainable bias parameter.
 
     Args:
         input(Variable): The input tensor variable.
@@ -1742,6 +1746,12 @@ def layer_norm(input,
 
     Returns:
         ${y_comment}
+
+    Examples:
+
+        >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
     """
     helper = LayerHelper('layer_norm', **locals())
     dtype = helper.input_dtype()
@@ -3262,12 +3272,6 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     """
     ${comment}
 
-    >>> import paddle.fluid as fluid
-    >>> x = fluid.layers.data(name='x', shape=[16],
-    >>>                        dtype='float32', lod_level=1)
-    >>> out = fluid.layers.row_conv(input=x, future_context_size=2)
-
-
     Args:
         input (${x_type}): ${x_comment}.
         future_context_size (int): Future context size. Please note, the shape
@@ -3278,6 +3282,12 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
 
     Returns:
         ${out_comment}.
+
+    Examples:
+        >>> import paddle.fluid as fluid
+        >>> x = fluid.layers.data(name='x', shape=[16],
+        >>>                        dtype='float32', lod_level=1)
+        >>> out = fluid.layers.row_conv(input=x, future_context_size=2)
     """
     helper = LayerHelper('row_conv', **locals())
     dtype = helper.input_dtype()
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 98f169e8f0..46c6fd686e 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -64,7 +64,6 @@ __all__ = [
     'logical_or',
     'logical_xor',
     'logical_not',
-    'uniform_random',
     'uniform_random_batch_size_like',
     'gaussian_random',
     'gaussian_random_batch_size_like',
@@ -79,3 +78,23 @@ __all__ = [
 
 for _OP in set(__all__):
     globals()[_OP] = generate_layer_fn(_OP)
+
+__all__ += ["uniform_random"]
+
+_uniform_random_ = generate_layer_fn('uniform_random')
+
+
+def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _uniform_random_(**kwargs)
+
+uniform_random.__doc__ = _uniform_random_.__doc__  + "\n"\
++"""
+Examples:
+
+    >>> result = fluid.layers.uniform_random(shape=[32, 784])
+"""
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 241bbe78bd..04efc40af5 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -6,7 +6,7 @@
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
+# Unlessf required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
@@ -57,12 +57,6 @@ def create_parameter(shape,
     NOTE: this is a very low-level API. This API is useful when you create
     operator by your self. instead of using layers.
 
-    >>> import paddle.fluid as fluid
-    >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
-    >>> data = fluid.layers.data(name="img", shape=[64, 784],
-    >>>           append_batch_size=False)
-    >>> hidden = fluid.layers.matmul(x=data, y=W)
-
     Args:
         shape(list[int]): shape of the parameter
         dtype(string): element type of the parameter
@@ -74,7 +68,12 @@ def create_parameter(shape,
         default_initializer(Initializer): initializer for the parameter
 
     Returns:
-        the created parameter
+        the created parameter.
+
+    Examples:
+        >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
+        >>> data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
+        >>> hidden = fluid.layers.matmul(x=data, y=W)
     """
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:

From 674327a4b179286bbe7db2e2b96f8e59c0120562 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Wed, 13 Jun 2018 18:06:59 +0800
Subject: [PATCH 18/69] Polish several API

---
 paddle/fluid/operators/activation_op.cc |  3 +-
 python/paddle/fluid/layers/detection.py | 38 ++++++++++++-------------
 python/paddle/fluid/layers/ops.py       | 28 ++++++++++++++++--
 3 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 5e2fa56677..89bb1e2d4f 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -271,7 +271,8 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "Input of HardShrink operator");
     AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+    AddAttr<float>("threshold",
+                   "The value of threshold for HardShrink. [default: 0.5]")
         .SetDefault(0.5f);
     AddComment(R"DOC(
 HardShrink Activation Operator.
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 1e8dfbe521..d5f7e420dd 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -403,25 +403,6 @@ def ssd_loss(location,
 
       5.3 Compute the overall weighted loss.
 
-    >>> import paddle.fluid.layers as layers
-    >>> pb = layers.data(
-    >>>            name='prior_box',
-    >>>            shape=[10, 4],
-    >>>            append_batch_size=False,
-    >>>            dtype='float32')
-    >>> pbv = layers.data(
-    >>>            name='prior_box_var',
-    >>>            shape=[10, 4],
-    >>>            append_batch_size=False,
-    >>>            dtype='float32')
-    >>> loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
-    >>> scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
-    >>> gt_box = layers.data(
-    >>>         name='gt_box', shape=[4], lod_level=1, dtype='float32')
-    >>> gt_label = layers.data(
-    >>>         name='gt_label', shape=[1], lod_level=1, dtype='float32')
-    >>>     loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
-
     Args:
         location (Variable): The location predictions are a 3D Tensor with
             shape [N, Np, 4], N is the batch size, Np is total number of
@@ -465,6 +446,25 @@ def ssd_loss(location,
     Raises:
         ValueError: If mining_type is 'hard_example', now only support mining \
         type of `max_negative`.
+
+    Examples:
+        >>> pb = fluid.layers.data(
+        >>>                   name='prior_box',
+        >>>                   shape=[10, 4],
+        >>>                   append_batch_size=False,
+        >>>                   dtype='float32')
+        >>> pbv = fluid.layers.data(
+        >>>                   name='prior_box_var',
+        >>>                   shape=[10, 4],
+        >>>                   append_batch_size=False,
+        >>>                   dtype='float32')
+        >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32')
+        >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32')
+        >>> gt_box = fluid.layers.data(
+        >>>         name='gt_box', shape=[4], lod_level=1, dtype='float32')
+        >>> gt_label = fluid.layers.data(
+        >>>         name='gt_label', shape=[1], lod_level=1, dtype='float32')
+        >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
     """
 
     helper = LayerHelper('ssd_loss', **locals())
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 46c6fd686e..f0abd3089d 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -40,7 +40,6 @@ __activations__ = [
     'relu6',
     'pow',
     'stanh',
-    'hard_shrink',
     'thresholded_relu',
     'hard_sigmoid',
     'swish',
@@ -92,9 +91,32 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
             kwargs[name] = val
     return _uniform_random_(**kwargs)
 
-uniform_random.__doc__ = _uniform_random_.__doc__  + "\n"\
-+"""
+
+uniform_random.__doc__ = _uniform_random_.__doc__ + "\n" \
+                         + """
 Examples:
 
     >>> result = fluid.layers.uniform_random(shape=[32, 784])
 """
+
+__all__ += ['hard_shrink']
+
+_hard_shrink_ = generate_layer_fn('hard_shrink')
+
+
+def hard_shrink(x, threshold=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _hard_shrink_(**kwargs)
+
+
+hard_shrink.__doc__ = _hard_shrink_.__doc__ + "\n" \
+                      + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[784])
+    >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
+"""

From 9b13b4c0d295544b6eaed50ad77657b20d3736b6 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 13 Jun 2018 17:32:04 +0800
Subject: [PATCH 19/69] Add doc

---
 doc/fluid/api/layers.rst         | 19 +++++++++++++++++++
 python/paddle/fluid/layers/nn.py |  2 +-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index f78e6db326..4be56791b1 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -361,6 +361,12 @@ conv2d
 ..  autofunction:: paddle.fluid.layers.conv2d
     :noindex:
 
+conv3d
+------
+
+..  autofunction:: paddle.fluid.layers.conv3d
+    :noindex:
+
 sequence_pool
 -------------
 
@@ -385,6 +391,12 @@ pool2d
 ..  autofunction:: paddle.fluid.layers.pool2d
     :noindex:
 
+pool3d
+------
+
+..  autofunction:: paddle.fluid.layers.pool3d
+    :noindex:
+
 batch_norm
 ----------
 
@@ -403,6 +415,13 @@ conv2d_transpose
 ..  autofunction:: paddle.fluid.layers.conv2d_transpose
     :noindex:
 
+conv3d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv2d_transpose
+    :noindex:
+
+
 sequence_expand
 ---------------
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 3a4f93929f..888245fc9e 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1722,7 +1722,7 @@ def pool2d(input,
     if not isinstance(use_cudnn, bool):
         raise ValueError("use_cudnn should be True or False")
 
-    l_type = 'conv2d'
+    l_type = 'pool2d'
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()

From 980499faf15e869f237bfd94171f4aea079865e5 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Thu, 14 Jun 2018 13:54:23 +0800
Subject: [PATCH 20/69] fix errors

---
 paddle/fluid/operators/pool_op.cc             | 15 ++++--
 python/paddle/fluid/layers/control_flow.py    | 23 +++++----
 python/paddle/fluid/layers/io.py              | 26 ++++++++++
 .../fluid/layers/learning_rate_scheduler.py   |  4 +-
 python/paddle/fluid/layers/nn.py              | 51 ++++++++++++++-----
 python/paddle/fluid/layers/tensor.py          |  7 +--
 6 files changed, 92 insertions(+), 34 deletions(-)

diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 6707cdded4..d94ddc7a53 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -204,8 +204,6 @@ void Pool2dOpMaker::Make() {
   // TODO(dzhwinter): need to registered layout transform function
 
   AddComment(R"DOC(
-Pool2d Operator.
-
 The pooling2d operation calculates the output based on
 the input, pooling_type and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
@@ -215,18 +213,27 @@ These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
+
   Input:
+
        X shape: $(N, C, H_{in}, W_{in})$
+
   Output:
+
        Out shape: $(N, C, H_{out}, W_{out})$
+
   For ceil_mode = false:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+       $$
+       $$
        W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
   For ceil_mode = true:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
+       $$
+       $$
        W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
        $$
 
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index f4013b61d0..3ffebb960b 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -753,9 +753,9 @@ def lod_tensor_to_array(x, table):
 
     This function split a LoDTesnor to a LoDTensorArray according to its LoD 
     information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in 
-    Paddle. The generated LoDTensorArray of this function can be further read 
-    or written by 'read_from_array()' and 'write_to_array()' operators. However, 
-    this function is generally an internal component of Paddle 'DynamicRNN'. 
+    PaddlePaddle. The generated LoDTensorArray of this function can be further read 
+    or written by `read_from_array()` and `write_to_array()` operators. However, 
+    this function is generally an internal component of PaddlePaddle `DynamicRNN`. 
     Users should not use it directly.
 
     Args:
@@ -763,11 +763,10 @@ def lod_tensor_to_array(x, table):
         table (ParamAttr|list): The variable that stores the level of lod
                                 which is ordered by sequence length in
                                 descending order. It is generally generated 
-                                by 'layers.lod_rank_table()' API.
+                                by `layers.lod_rank_table()` API.
 
     Returns:
-        Variable: The LoDTensorArray that has been converted from the input 
-                  tensor.
+        Variable: The LoDTensorArray that has been converted from the input tensor.
 
     Examples:
         .. code-block:: python
@@ -1579,24 +1578,26 @@ def reorder_lod_tensor_by_rank(x, rank_table):
 
 def is_empty(x, cond=None, **ignored):
     """
-    Test whether an Variable is empty.
+    Test whether a Variable is empty.
 
     Args:
         x (Variable): The Variable to be tested.
         cond (Variable|None): Output parameter. Returns the test result 
-                              of given 'x'.
+                              of given 'x'. Default: None
 
     Returns:
-        Variable: The tensor variable storing the test result of 'x'.
+        Variable: A bool scalar. True if 'x' is an empty Variable.
 
     Raises:
         TypeError: If input cond is not a variable, or cond's dtype is
-                   not bool
+                   not bool.
 
     Examples:
         .. code-block:: python
 
-          less = fluid.layers.is_empty(x=input)
+          res = fluid.layers.is_empty(x=input)
+          # or:
+          fluid.layers.is_empty(x=input, cond=res)
     """
     helper = LayerHelper("is_empty", **locals())
     if cond is None:
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 9de88e2c32..6d6cdffe27 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -572,6 +572,32 @@ def parallel(reader):
 
 
 def read_file(file_obj):
+    """
+    Read data from a file object.
+
+    A file object is also a Variable. It can be a raw file object generated by 
+    `fluid.layers.open_files()` or a decorated one generated by 
+    `fluid.layers.double_buffer()` and so on.
+
+    Args:
+
+        file_obj(Variable): The file object from where to read data.
+
+    Returns:
+        Tuple[Variable]: Data read from the given file object.
+
+    Examples:
+        .. code-block:: python
+
+           data_file = fluid.layers.open_files(
+                filenames=['mnist.recordio'],
+                shapes=[(-1, 748), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"])
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(data_file, batch_size=64))
+            input, label = fluid.layers.read_file(data_file)
+    """
     helper = LayerHelper('read_file')
     out = [
         helper.create_tmp_variable(
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 9cbb559093..0efa726331 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -90,7 +90,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
                             Default: False
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
 
     Examples:
         .. code-block:: python
@@ -167,7 +167,7 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
                             Default: False
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 3194b72da6..030681999d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -151,7 +151,7 @@ def fc(input,
         name (str, default None): The name of this layer.
 
     Returns:
-        A tensor variable storing the transformation result.
+        Variable: The transformation result.
 
     Raises:
         ValueError: If rank of the input tensor is less than 2.
@@ -159,8 +159,7 @@ def fc(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(
-              name="data", shape=[32, 32], dtype="float32")
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
           fc = fluid.layers.fc(input=data, size=1000, act="tanh")
     """
 
@@ -1543,21 +1542,24 @@ def pool2d(input,
     ${comment}
 
     Args:
-        input (Variable): ${input_comment}
+        input (Variable): The input tensor of pooling operator. The format of 
+        input tensor is NCHW, where N is batch size, C is the number of 
+        channels, H is the height of the feature, and W is the width of 
+        the feature.
         pool_size (int): The side length of pooling windows. All pooling 
                          windows are squares with pool_size on a side.
-        pool_type (str): ${pooling_type_comment}
+        pool_type: ${pooling_type_comment}
         pool_stride (int): stride of the pooling layer.
         pool_padding (int): padding size.
-        global_pooling (bool): ${global_pooling_comment}
-        use_cudnn (bool): ${use_cudnn_comment}
-        ceil_mode (bool): ${ceil_mode_comment}
-        use_mkldnn (bool): ${use_mkldnn_comment}
+        global_pooling: ${global_pooling_comment}
+        use_cudnn: ${use_cudnn_comment}
+        ceil_mode: ${ceil_mode_comment}
+        use_mkldnn: ${use_mkldnn_comment}
         name (str|None): A name for this layer(optional). If set None, the 
                         layer will be named automatically.
 
     Returns:
-        Variable: output of pool2d layer.
+        Variable: The pooling result.
 
     Raises:
         ValueError: If 'pool_type' is not "max" nor "avg"
@@ -2764,6 +2766,27 @@ def topk(input, k, name=None):
     If the input is a Tensor with higher rank, this operator computes the top k
     entries along the last dimension.
 
+    For example:
+
+    .. code-block:: text
+
+        If:
+            input = [[5, 4, 2, 3],
+                     [9, 7, 10, 25],
+                     [6, 2, 10, 1]]
+            k = 2
+
+        Then:
+            The first output:
+            values = [[5, 4],
+                      [10, 25],
+                      [6, 10]]
+
+            The second output:
+            indices = [[0, 1],
+                       [2, 3],
+                       [0, 2]]
+
     Args:
         input(Variable): The input variable which can be a vector or Tensor with
             higher rank.
@@ -2774,10 +2797,10 @@ def topk(input, k, name=None):
                        Default: None
 
     Returns:
-        values(Variable): The k largest elements along each last dimensional
-            slice.
-        indices(Variable): The indices of values within the last dimension of
-            input.
+        Tuple[Variable]: A tuple with two elements. Each element is a Variable. 
+        The first one is k largest elements along each last 
+        dimensional slice. The second one is indices of values 
+        within the last dimension of input.
 
     Raises:
         ValueError: If k < 1 or k is not less than the last dimension of input
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index e03c8ca914..392fa6a422 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -159,20 +159,21 @@ def concat(input, axis=0, name=None):
 
 
 def sums(input, out=None):
-    """This function performs the sum operation on the input and returns the
+    """
+    This function performs the sum operation on the input and returns the
     result as the output.
 
     Args:
         input (Variable|list): The input tensor that has the elements
                                that need to be summed up.
-        out (Variable|None): Output parameter. Returns the sum result.
+        out (Variable|None): Output parameter. The sum result.
                              Default: None
 
     Returns:
         Variable: the sum of input. The same as the argument 'out'
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)

From cbc1b7f1cecc8d5e0e14315dad43410fb0d53f23 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Thu, 14 Jun 2018 16:58:25 +0800
Subject: [PATCH 21/69] Polish documentation

---
 paddle/fluid/operators/activation_op.cc    | 13 ++--
 paddle/fluid/operators/compare_op.cc       | 11 ++-
 paddle/fluid/operators/multiplex_op.cc     |  4 +-
 paddle/fluid/operators/row_conv_op.cc      |  2 +-
 python/paddle/fluid/layers/control_flow.py | 31 +++++++--
 python/paddle/fluid/layers/detection.py    | 81 +++++++++++++---------
 python/paddle/fluid/layers/io.py           | 43 +++++++++---
 python/paddle/fluid/layers/ops.py          | 23 +++++-
 8 files changed, 144 insertions(+), 64 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 89bb1e2d4f..91a2826947 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -275,7 +275,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
                    "The value of threshold for HardShrink. [default: 0.5]")
         .SetDefault(0.5f);
     AddComment(R"DOC(
-HardShrink Activation Operator.
+** HardShrink activation operator **
 
 ..  math::
     out = \begin{cases}
@@ -399,13 +399,12 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 ThresholdedRelu Activation Operator.
 
-$$
-out = \begin{cases} 
-    x, \text{if } x > threshold \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
 
+    out = \begin{cases}
+             x, \text{if } x > threshold \\
+             0,  \text{otherwise}
+          \end{cases}
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index 11e91c5ec8..f40b1ba338 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -34,16 +34,15 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(true);
     AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
                                      comment.equation));
-    AddComment(string::Sprintf(R"DOC(%s Operator
-
+    AddComment(string::Sprintf(R"DOC(
 It operates element-wise on X and Y, and returns the Out. Each of them is a
 N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
 calculated by $%s$
 )DOC",
-                               comment.type, comment.equation));
-    AddAttr<int>("axis",
-                 "(int, default -1). The start dimension index "
-                 "for broadcasting Y onto X.")
+                               comment.equation));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
   }
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index 9db2df2a4c..18ad46cb5e 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -96,7 +96,9 @@ the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-$ y[i] = x_{k}[i] $
+$$
+y[i] = x_{k}[i]
+$$
 
 where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
 and $k = Ids[i]$.
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index d7286111fd..52c37e8c91 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -94,7 +94,7 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
               "in this LodTensor is a matrix with shape T x N, i.e., the "
               "same shape as X.");
     AddComment(R"DOC(
-Row-convolution Operator.
+** Row-convolution operator **
 
 The row convolution is called lookahead convolution.  This operator was 
 introduced in the following paper for DeepSpeech2:
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 8d28aeb2ed..a618937b11 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1008,8 +1008,28 @@ def array_read(array, i):
 
 def shrink_memory(x, i, table):
     """
-    This function creates an operator to shrink_rnn_memory using the RankTable
+    This function creates an operator to shrink rnn memory using the RankTable
     as mentioned in the input parameter.
+
+    NOTE: This API is very low-level API. It is used by DynamicRNN only.
+
+    Since the Dynamic RNN uses no-padding way to implement RNN. The sequence
+    will be sorted by order, and the length of valid memory will be shrink after
+    each time step.
+
+    Args:
+        x(Variable): The memory object in the previous time step.
+        i(Variable): The step count variable. A int scalar as LoDTensor.
+        table(Variable): The RNNRankTable object.
+
+    Returns:
+        the memory variable after shrink.
+
+    Examples:
+
+        Since this API is very low level API. The example is not provided.
+        Please reference the implementation of class DynamicRNN for detail
+        usage.
     """
     helper = LayerHelper('shrink_memory', **locals())
     out = helper.create_tmp_variable(dtype=x.dtype)
@@ -1316,10 +1336,9 @@ class IfElse(object):
 
 class DynamicRNN(object):
     """
-    Dynamic RNN.
-
-    This RNN can process a batch of sequence data. The length of each sample
-    sequence can be different. This API automatically process them in batch.
+    The dynamic RNN can process a batch of sequence data. The length of each
+    sample sequence can be different. This API automatically process them in
+    batch.
 
     The input lod must be set. Please reference `lod_tensor`
 
@@ -1500,7 +1519,7 @@ class DynamicRNN(object):
                need_reorder=False,
                dtype='float32'):
         """
-        Create a memory variable.
+        Create a memory variable for dynamic rnn.
 
         If the :code:`init` is not None, :code:`memory` will be initialized by
         this variable. The :code:`need_reorder` is used to reorder the memory as
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index d5f7e420dd..edf528a595 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -210,53 +210,68 @@ def bipartite_match(dist_matrix,
                     dist_threshold=None,
                     name=None):
     """
-    **Bipartite matchint operator**
-
-    This operator is a greedy bipartite matching algorithm, which is used to
-    obtain the matching with the maximum distance based on the input
+    This operator implements a greedy bipartite matching algorithm, which is
+    used to obtain the matching with the maximum distance based on the input
     distance matrix. For input 2D matrix, the bipartite matching algorithm can
-    find the matched column for each row, also can find the matched row for
-    each column. And this operator only calculate matched indices from column
-    to row. For each instance, the number of matched indices is the number of
-    of columns of the input ditance matrix.
-
-    There are two outputs to save matched indices and distance.
-    A simple description, this algothrim matched the best (maximum distance)
+    find the matched column for each row (matched means the largest distance),
+    also can find the matched row for each column. And this operator only
+    calculate matched indices from column to row. For each instance,
+    the number of matched indices is the column number of the input distance
+    matrix.
+
+    There are two outputs, matched indices and distance.
+    A simple description, this algorithm matched the best (maximum distance)
     row entity to the column entity and the matched indices are not duplicated
     in each row of ColToRowMatchIndices. If the column entity is not matched
     any row entity, set -1 in ColToRowMatchIndices.
 
-    Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
+    NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor.
     If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
     If Tensor, the height of ColToRowMatchIndices is 1.
 
+    NOTE: This API is a very low level API. It is used by :code:`ssd_loss`
+    layer. Please consider to use :code:`ssd_loss` instead.
+
     Args:
         dist_matrix(Variable): This input is a 2-D LoDTensor with shape
             [K, M]. It is pair-wise distance matrix between the entities
             represented by each row and each column. For example, assumed one
             entity is A with shape [K], another entity is B with shape [M]. The
-            dist_matirx[i][j] is the distance between A[i] and B[j]. The bigger
-            the distance is, the better macthing the pairs are. Please note,
-            This tensor can contain LoD information to represent a batch of
-            inputs. One instance of this batch can contain different numbers of
-            entities.
+            dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
+            the distance is, the better matching the pairs are.
+
+            NOTE: This tensor can contain LoD information to represent a batch
+            of inputs. One instance of this batch can contain different numbers
+            of entities.
         match_type(string|None): The type of matching method, should be
-           'bipartite' or 'per_prediction', 'bipartite' by defalut.
+           'bipartite' or 'per_prediction'. [default 'bipartite'].
         dist_threshold(float|None): If `match_type` is 'per_prediction',
             this threshold is to determine the extra matching bboxes based
-            on the maximum distance, 0.5 by defalut.
+            on the maximum distance, 0.5 by default.
     Returns:
-        match_indices(Variable): A 2-D Tensor with shape [N, M] in int type.
-            N is the batch size. If match_indices[i][j] is -1, it
-            means B[j] does not match any entity in i-th instance.
-            Otherwise, it means B[j] is matched to row
-            match_indices[i][j] in i-th instance. The row number of
-            i-th instance is saved in match_indices[i][j].
-        match_distance(Variable): A 2-D Tensor with shape [N, M] in float type.
-            N is batch size. If match_indices[i][j] is -1,
-            match_distance[i][j] is also -1.0. Otherwise, assumed
-            match_distance[i][j] = d, and the row offsets of each instance
-            are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j].
+        tuple: a tuple with two elements is returned. The first is
+        matched_indices, the second is matched_distance.
+
+        The matched_indices is a 2-D Tensor with shape [N, M] in int type.
+        N is the batch size. If match_indices[i][j] is -1, it
+        means B[j] does not match any entity in i-th instance.
+        Otherwise, it means B[j] is matched to row
+        match_indices[i][j] in i-th instance. The row number of
+        i-th instance is saved in match_indices[i][j].
+
+        The matched_distance is a 2-D Tensor with shape [N, M] in float type
+        . N is batch size. If match_indices[i][j] is -1,
+        match_distance[i][j] is also -1.0. Otherwise, assumed
+        match_distance[i][j] = d, and the row offsets of each instance
+        are called LoD. Then match_distance[i][j] =
+        dist_matrix[d+LoD[i]][j].
+
+    Examples:
+
+        >>> x = fluid.layers.data(name='x', shape=[4], dtype='float32')
+        >>> y = fluid.layers.data(name='y', shape=[4], dtype='float32')
+        >>> iou = fluid.layers.iou_similarity(x=x, y=y)
+        >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
     """
     helper = LayerHelper('bipartite_match', **locals())
     match_indices = helper.create_tmp_variable(dtype='int32')
@@ -364,7 +379,7 @@ def ssd_loss(location,
              normalize=True,
              sample_size=None):
     """
-    **Multi-box loss layer for object dection algorithm of SSD**
+    **Multi-box loss layer for object detection algorithm of SSD**
 
     This layer is to compute dection loss for SSD given the location offset
     predictions, confidence predictions, prior boxes and ground-truth boudding
@@ -372,7 +387,7 @@ def ssd_loss(location,
     is a weighted sum of the localization loss (or regression loss) and
     confidence loss (or classification loss) by performing the following steps:
 
-    1. Find matched boundding box by bipartite matching algorithm.
+    1. Find matched bounding box by bipartite matching algorithm.
 
       1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
 
@@ -435,7 +450,7 @@ def ssd_loss(location,
         mining_type (str): The hard example mining type, should be 'hard_example'
             or 'max_negative', now only support `max_negative`.
         normalize (bool): Whether to normalize the SSD loss by the total number
-            of output locations, True by defalut.
+            of output locations, True by default.
         sample_size (int): The max sample size of negative box, used only when
             mining_type is 'hard_example'.
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 13a3d5441a..150be96d86 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -302,15 +302,6 @@ def open_recordio_file(filename,
     """
     ${comment}
 
-    >>> import paddle.fluid as fluid
-    >>> reader = fluid.layers.io.open_recordio_file(
-    >>>                               filename='./data.recordio',
-    >>>                               shapes=[(3,224,224), (1)],
-    >>>                               lod_levels=[0, 0],
-    >>>                               dtypes=['float32', 'int64'])
-    >>> # Via the reader, we can use 'read_file' layer to get data:
-    >>> image, label = fluid.layers.io.read_file(reader)
-
     Args:
        filename(${filename_type}): ${filename_comment}.
        shapes(list): List of tuples which declaring data shapes.
@@ -322,6 +313,17 @@ def open_recordio_file(filename,
 
     Returns:
        ${out_comment}.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> reader = fluid.layers.io.open_recordio_file(
+        >>>                               filename='./data.recordio',
+        >>>                               shapes=[(3,224,224), (1)],
+        >>>                               lod_levels=[0, 0],
+        >>>                               dtypes=['float32', 'int64'])
+        >>> # Via the reader, we can use 'read_file' layer to get data:
+        >>> image, label = fluid.layers.io.read_file(reader)
     """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
@@ -549,6 +551,29 @@ def batch(reader, batch_size):
 
 
 def double_buffer(reader, place=None, name=None):
+    """
+    Wrap a double buffer reader. The data will copy to target place with a
+    double buffer queue. If the target place is None, the place that executor
+    perform on will be used.
+
+    Args:
+        reader(Variable): the reader variable need to be wrapped.
+        place(Place): the place of target data. Default is the sample place of
+            executor perform.
+
+        name(str): Variable name. None if the user does not care.
+
+    Returns:
+        wrapped reader with double buffer.
+
+    Examples:
+
+        >>> reader = fluid.layers.open_files(filenames=['somefile'],
+        >>>                                  shapes=[[-1, 784], [-1, 1]],
+        >>>                                  dtypes=['float32', 'int64'])
+        >>> reader = fluid.layers.double_buffer(reader)
+        >>> img, label = fluid.layers.read_file(reader)
+    """
     attrs = dict()
     if place is not None:
         attrs['place'] = str(place).upper()
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index f0abd3089d..486d6f371f 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -66,7 +66,6 @@ __all__ = [
     'uniform_random_batch_size_like',
     'gaussian_random',
     'gaussian_random_batch_size_like',
-    'cumsum',
     'scatter',
     'sum',
     'slice',
@@ -120,3 +119,25 @@ Examples:
     >>> data = fluid.layers.data(name="input", shape=[784])
     >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
 """
+
+__all__ += ['cumsum']
+
+_cum_sum_ = generate_layer_fn('cumsum')
+
+
+def cumsum(x, axis=None, exclusive=None, reverse=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+
+    return _cum_sum_(**kwargs)
+
+
+cumsum.__doc__ = _cum_sum_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[32, 784])
+    >>> result = fluid.layers.cumsum(data, axis=0)
+"""

From 055df47035fe9729f2cca9b1cd14874b1f6fe560 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Thu, 14 Jun 2018 17:31:08 +0800
Subject: [PATCH 22/69] Polish code

---
 paddle/fluid/operators/activation_op.cc |  9 ++++----
 paddle/fluid/operators/row_conv_op.cc   |  2 +-
 python/paddle/fluid/layers/ops.py       | 29 ++++++++++++++++++++-----
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 91a2826947..c73482eb12 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -275,7 +275,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
                    "The value of threshold for HardShrink. [default: 0.5]")
         .SetDefault(0.5f);
     AddComment(R"DOC(
-** HardShrink activation operator **
+:strong:`HardShrink activation operator`
 
 ..  math::
     out = \begin{cases}
@@ -394,15 +394,16 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "Input of ThresholdedRelu operator");
     AddOutput("Out", "Output of ThresholdedRelu operator");
-    AddAttr<float>("threshold", "The threshold location of activation")
+    AddAttr<float>("threshold",
+                   "The threshold location of activation. [default 1.0].")
         .SetDefault(1.0f);
     AddComment(R"DOC(
-ThresholdedRelu Activation Operator.
+:strong:`ThresholdedRelu activation operator`
 
 ..  math::
 
     out = \begin{cases}
-             x, \text{if } x > threshold \\
+             x,  \text{if } x > threshold \\
              0,  \text{otherwise}
           \end{cases}
 )DOC");
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 52c37e8c91..10b1b0c899 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -94,7 +94,7 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
               "in this LodTensor is a matrix with shape T x N, i.e., the "
               "same shape as X.");
     AddComment(R"DOC(
-** Row-convolution operator **
+:strong:`Row-convolution operator`
 
 The row convolution is called lookahead convolution.  This operator was 
 introduced in the following paper for DeepSpeech2:
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 486d6f371f..6f404c5cc6 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -40,7 +40,6 @@ __activations__ = [
     'relu6',
     'pow',
     'stanh',
-    'thresholded_relu',
     'hard_sigmoid',
     'swish',
 ]
@@ -91,8 +90,7 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
     return _uniform_random_(**kwargs)
 
 
-uniform_random.__doc__ = _uniform_random_.__doc__ + "\n" \
-                         + """
+uniform_random.__doc__ = _uniform_random_.__doc__ + """
 Examples:
 
     >>> result = fluid.layers.uniform_random(shape=[32, 784])
@@ -112,8 +110,7 @@ def hard_shrink(x, threshold=None):
     return _hard_shrink_(**kwargs)
 
 
-hard_shrink.__doc__ = _hard_shrink_.__doc__ + "\n" \
-                      + """
+hard_shrink.__doc__ = _hard_shrink_.__doc__ + """
 Examples:
 
     >>> data = fluid.layers.data(name="input", shape=[784])
@@ -141,3 +138,25 @@ Examples:
     >>> data = fluid.layers.data(name="input", shape=[32, 784])
     >>> result = fluid.layers.cumsum(data, axis=0)
 """
+
+__all__ += ['thresholded_relu']
+
+_thresholded_relu_ = generate_layer_fn('thresholded_relu')
+
+
+def thresholded_relu(x, threshold=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+
+    _thresholded_relu_(**kwargs)
+
+
+thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[1])
+    >>> result = fluid.layers.thresholded_relu(data, threshold=0.4)
+"""

From 44925eb4c2d56ede78c6a1a68aeef57cfbfe03d1 Mon Sep 17 00:00:00 2001
From: "yi.wu" <yi.wu@baifendian.com>
Date: Thu, 14 Jun 2018 17:37:20 +0800
Subject: [PATCH 23/69] fix dist ut

---
 paddle/fluid/operators/listen_and_serv_op.cc  |  3 +-
 python/paddle/fluid/layers/io.py              | 49 +++++++++----------
 .../fluid/tests/unittests/test_dist_train.py  | 29 ++++++++---
 .../unittests/test_listen_and_serv_op.py      | 21 ++++----
 4 files changed, 59 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 4d12278799..57c2ce4577 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 };
 
 void SignalHandler::StopAndExit(int signal_num) {
-  VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
+  // Do not use VLOG here for the device for printing maybe already released.
+  // exit will release interal allocated resoureces.
   exit(0);
 }
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 9de88e2c32..bbb8e2e9cf 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -22,9 +22,9 @@ from ..executor import global_scope
 from layer_function_generator import generate_layer_fn, templatedoc
 
 __all__ = [
-    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
-    'random_data_generator', 'Preprocessor', 'load'
+    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
+    'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
+    'double_buffer', 'random_data_generator', 'Preprocessor', 'load'
 ]
 
 
@@ -177,18 +177,17 @@ class ListenAndServ(object):
             })
 
 
-def Send(endpoints, send_vars, get_vars=None):
+def Send(endpoints, send_vars, sync=True):
     """
-    Send layer
+    Send variables to the server side, and get vars from server
+    side when server have finished running server side program.
 
     Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma seperated IP:PORT pairs in the order
                    of send_vars to send
-        send_vars: vars to send
-        get_vars: vars to get from server after send completes.
-
-    Send variables to the server side, and get vars from server
-    side when server have finished running server side program.
+        send_vars (list): variables to send to server
+        sync (bool): whether to wait the request finish
+    
     """
     assert (type(send_vars) == list)
 
@@ -196,40 +195,33 @@ def Send(endpoints, send_vars, get_vars=None):
     endpoints = list(set(epmap))
 
     helper = LayerHelper("Send", **locals())
-    if not get_vars:
-        get_vars = []
-        for s in send_vars:
-            v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True)
-            get_vars.append(v)
     rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 
     helper.append_op(
         type="send",
         inputs={"X": send_vars},
-        outputs={"Out": get_vars},
         attrs={
             "endpoints": endpoints,
             "epmap": epmap,
             rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
         })
+    if sync:
+        helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
 
-    return get_vars
 
-
-def Recv(endpoints, get_vars):
+def Recv(endpoints, get_vars, sync=True):
     """
-    Recv layer
+    Receive variables from server side
 
     Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma seperated IP:PORT pairs in the order
                    of send_vars to send
-        send_vars: vars to send
-        get_vars: vars to get from server after send completes.
+        get_vars (list): vars to get from server after send completes.
+        sync (bool): whether to wait the request finish
 
-    Send variables to the server side, and get vars from server
-    side when server have finished running server side program.
+    Returns:
+        list: list of received variables
     """
-    assert (type(send_vars) == list)
     assert (type(get_vars) == list)
 
     epmap = endpoints.split(",")
@@ -242,6 +234,9 @@ def Recv(endpoints, get_vars):
         outputs={"Out": get_vars},
         attrs={"endpoints": endpoints,
                "epmap": epmap})
+    if sync:
+        helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints})
+    return get_vars
 
 
 def monkey_patch_reader_methods(reader):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index 2314bb2ed8..562e66b062 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -16,6 +16,7 @@ import os
 import time
 import unittest
 from multiprocessing import Process
+import signal
 
 import numpy
 
@@ -24,9 +25,6 @@ import paddle.fluid.layers as layers
 
 
 class TestSendOp(unittest.TestCase):
-    @unittest.skip(
-        "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
-    )
     def test_send(self):
         # Run init_serv in a thread
         place = fluid.CPUPlace()
@@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase):
         p.daemon = True
         p.start()
 
-        time.sleep(10)
+        self.ps_timeout = 5
+        self._wait_ps_ready(p.pid)
+
         with open("/tmp/paddle.%d.port" % p.pid, "r") as fn:
             selected_port = int(fn.readlines()[0])
         self.init_client(place, selected_port)
@@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase):
         self.assertTrue(numpy.allclose(self.local_out, self.dist_out))
 
         # FIXME(typhoonzero): find a way to gracefully shutdown the server.
-        os.system("kill -9 %d" % p.pid)
+        os.kill(p.pid, signal.SIGKILL)
         p.join()
 
+    def _wait_ps_ready(self, pid):
+        start_left_time = self.ps_timeout
+        sleep_time = 0.5
+        while True:
+            assert start_left_time >= 0, "wait ps ready failed"
+            time.sleep(sleep_time)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                start_left_time -= sleep_time
+
     def init_serv(self, place):
         main = fluid.Program()
 
@@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase):
                 dtype="float32",
                 persistable=False,
                 shape=[32, 32])
-            o = layers.Send("127.0.0.1:%d" % port, [x], [get_var])
+            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
+            layers.Send("127.0.0.1:%d" % port, [x])
+            o = layers.Recv("127.0.0.1:%d" % port, [get_var])
+
         exe = fluid.Executor(place)
         self.dist_out = exe.run(main, fetch_list=o)  # o is a list
 
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index d1d709551c..9dec2acb1d 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest):
     def setUp(self):
         self.ps_timeout = 5
         self.ip = "127.0.0.1"
-        self.port = "6173"
+        self.port = "0"
         self.trainers = 1
-        self.trainer_id = 1
+        self.trainer_id = 0
 
     def _start_pserver(self, use_cuda, sync_mode):
         p = Process(
             target=run_pserver,
             args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
                   self.trainer_id))
+        p.daemon = True
         p.start()
-        return p.pid
+        return p
 
     def _wait_ps_ready(self, pid):
         start_left_time = self.ps_timeout
@@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest):
 
     def test_handle_signal_in_serv_op(self):
         # run pserver on CPU in sync mode
-        pid = self._start_pserver(False, True)
-        self._wait_ps_ready(pid)
+        p1 = self._start_pserver(False, True)
+        self._wait_ps_ready(p1.pid)
 
         # raise SIGTERM to pserver
-        os.kill(pid, signal.SIGTERM)
+        os.kill(p1.pid, signal.SIGKILL)
+        p1.join()
 
         # run pserver on CPU in async mode
-        pid = self._start_pserver(False, False)
-        self._wait_ps_ready(pid)
+        p2 = self._start_pserver(False, False)
+        self._wait_ps_ready(p2.pid)
 
         # raise SIGTERM to pserver
-        os.kill(pid, signal.SIGTERM)
+        os.kill(p2.pid, signal.SIGKILL)
+        p2.join()
 
 
 if __name__ == '__main__':

From ca743de2e09c4e966a7b647d3ce6b304fb61cdb7 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Thu, 14 Jun 2018 18:06:55 +0800
Subject: [PATCH 24/69] enable more type for splitOp and ConcatOp

---
 paddle/fluid/operators/concat_op.cc    | 10 ++++++++--
 paddle/fluid/operators/concat_op.cu.cc | 10 ++++++++--
 paddle/fluid/operators/split_op.cc     |  5 ++++-
 paddle/fluid/operators/split_op.cu.cc  |  5 ++++-
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 38337f9aa5..c724055937 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -107,7 +107,13 @@ REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
                       false> /* set false to disable empty grad */);
 REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int>);
 REGISTER_OP_CPU_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc
index 590eca9d06..8e38e5231f 100644
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
@@ -15,7 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int>);
 REGISTER_OP_CUDA_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index 5e2b2a9945..d661b276bc 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -115,4 +115,7 @@ USE_CPU_ONLY_OP(concat);
 
 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
 REGISTER_OP_CPU_KERNEL(split,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, double>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int>);
diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc
index efa378af85..18e0904681 100644
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
@@ -15,4 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/split_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>);
+    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int>);

From b3cb536402c8e3cc332c6c6da5de13a28ff78acc Mon Sep 17 00:00:00 2001
From: wanghaoshuang <wanghaoshuang@baidu.com>
Date: Thu, 14 Jun 2018 11:53:35 +0000
Subject: [PATCH 25/69] Fix doc of relu, log and zeros.

---
 python/paddle/fluid/layers/nn.py     | 130 ++++++++++++++-------------
 python/paddle/fluid/layers/ops.py    |   2 -
 python/paddle/fluid/layers/tensor.py |   7 +-
 3 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bd6ed0f30e..97a8af69ea 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -24,66 +24,20 @@ from tensor import concat
 import utils
 
 __all__ = [
-    'fc',
-    'embedding',
-    'dynamic_lstm',
-    'dynamic_lstmp',
-    'dynamic_gru',
-    'gru_unit',
-    'linear_chain_crf',
-    'crf_decoding',
-    'cos_sim',
-    'cross_entropy',
-    'square_error_cost',
-    'chunk_eval',
-    'sequence_conv',
-    'conv2d',
-    'sequence_pool',
-    'sequence_softmax',
-    'softmax',
-    'pool2d',
-    'batch_norm',
-    'beam_search_decode',
-    'conv2d_transpose',
-    'sequence_expand',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'reduce_prod',
-    'sequence_first_step',
-    'sequence_last_step',
-    'dropout',
-    'split',
-    'ctc_greedy_decoder',
-    'edit_distance',
-    'l2_normalize',
-    'matmul',
-    'topk',
-    'warpctc',
-    'sequence_reshape',
-    'transpose',
-    'im2sequence',
-    'nce',
-    'beam_search',
-    'row_conv',
-    'multiplex',
-    'layer_norm',
-    'softmax_with_cross_entropy',
-    'smooth_l1',
-    'one_hot',
-    'autoincreased_step_counter',
-    'reshape',
-    'lod_reset',
-    'lrn',
-    'pad',
-    'label_smooth',
-    'roi_pool',
-    'dice_loss',
-    'resize_bilinear',
-    'gather',
-    'random_crop',
+    'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
+    'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
+    'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d',
+    'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm',
+    'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit',
+    'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
+    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
+    'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
+    'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
+    'beam_search', 'row_conv', 'multiplex', 'layer_norm',
+    'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
+    'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad',
+    'label_smooth', 'roi_pool', 'dice_loss', 'resize_bilinear', 'gather',
+    'random_crop', 'relu', 'log'
 ]
 
 
@@ -4075,3 +4029,59 @@ def random_crop(input, shape, seed=1):
                  "SeedOut": seed_out},
         attrs={"shape": shape})
     return out
+
+
+def log(x):
+    """
+    Calculates the natural log of the given input tensor, element-wise.
+
+    .. math::
+
+        Out = \\ln(x)
+
+    Args:
+        x (Variable): Input tensor. 
+
+    Returns:
+        Variable: The natural log of the input tensor computed element-wise.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.log(x)
+    """
+    helper = LayerHelper('log', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(type="log", inputs={"X": input}, outputs={"Out": out})
+    return out
+
+
+def relu(x):
+    """
+    Relu takes one input data (Tensor) and produces one output data (Tensor)
+    where the rectified linear function, y = max(0, x), is applied to
+    the tensor elementwise.
+
+    .. math::
+
+        Out = \\max(0, x)
+
+    Args:
+        x (Variable): The input tensor. 
+
+    Returns:
+        Variable: The output tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.relu(x)
+    """
+    helper = LayerHelper('relu', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(type="relu", inputs={"X": input}, outputs={"Out": out})
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 60f8cbbfa7..7d3c44a389 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -17,7 +17,6 @@ __activations__ = [
     'sigmoid',
     'logsigmoid',
     'exp',
-    'relu',
     'tanh',
     'tanh_shrink',
     'softshrink',
@@ -29,7 +28,6 @@ __activations__ = [
     'sin',
     'round',
     'reciprocal',
-    'log',
     'square',
     'softplus',
     'softsign',
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index be34cc81a5..c7e9813c49 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -349,11 +349,12 @@ def zeros(shape, dtype, force_cpu=False):
     It also sets *stop_gradient* to True.
 
     Args:
-        shape(tuple|list|None): Shape of output tensor
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
+        shape(tuple|list|None): Shape of output tensor.
+        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor.
+        force_cpu(bool, default False): Whether to make output stay on CPU.
 
     Returns:
-        Variable: The tensor variable storing the output
+        Variable: The tensor variable storing the output.
 
     Examples:
         .. code-block:: python

From a6e69d3cc12c922b9cfc13b653fa845050390e68 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Thu, 14 Jun 2018 20:40:02 +0800
Subject: [PATCH 26/69] Add doc for 'batch'

---
 python/paddle/fluid/layers/io.py | 45 ++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 6d6cdffe27..e22257ea34 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -549,6 +549,41 @@ def shuffle(reader, buffer_size):
 
 
 def batch(reader, batch_size):
+    """
+    This layer is a reader decorator. It takes a reader and adds 
+    'batching' decoration on it. When reading with the result 
+    decorated reader, output data will be automatically organized 
+    to the form of batches.
+
+    Args:
+        reader(Variable): The reader to be decorated with 'batching'.
+        batch_size(int): The batch size.
+
+    Returns:
+        Variable: The reader which has been decorated with 'batching'.
+
+    Examples:
+        .. code-block:: python
+
+            raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                           './data2.recordio'],
+                                                    shapes=[(3,224,224), (1)],
+                                                    lod_levels=[0, 0],
+                                                    dtypes=['float32', 'int64'],
+                                                    thread_num=2,
+                                                    buffer_size=2)
+            batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5)
+
+            # If we read data with the raw_reader:
+            #     data = fluid.layers.read_file(raw_reader)
+            # We can only get data instance by instance.
+            # 
+            # However, if we read data with the batch_reader:
+            #     data = fluid.layers.read_file(batch_reader)
+            # Each 5 adjacent instances will be automatically combined together 
+            # to become a batch. So what we get('data') is a batch data instead 
+            # of an instance.
+    """
     return __create_unshared_decorated_reader__(
         'create_batch_reader', reader, {'batch_size': int(batch_size)})
 
@@ -571,20 +606,20 @@ def parallel(reader):
                                               {})
 
 
-def read_file(file_obj):
+def read_file(reader):
     """
-    Read data from a file object.
+    Execute the given reader and get data via it.
 
-    A file object is also a Variable. It can be a raw file object generated by 
+    A reader is also a Variable. It can be a raw reader generated by 
     `fluid.layers.open_files()` or a decorated one generated by 
     `fluid.layers.double_buffer()` and so on.
 
     Args:
 
-        file_obj(Variable): The file object from where to read data.
+        reader(Variable): The reader to execute.
 
     Returns:
-        Tuple[Variable]: Data read from the given file object.
+        Tuple[Variable]: Data read via the given reader.
 
     Examples:
         .. code-block:: python

From fbbac505104225329ff2953116acfe1db50d6eac Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Thu, 14 Jun 2018 06:07:31 -0700
Subject: [PATCH 27/69] Fix typos and format problems in smooth_l1's doc

---
 python/paddle/fluid/layers/nn.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 2c1f988828..ed2e1811f6 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3411,31 +3411,30 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
 
 def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
-    **Smooth L1 Loss Operator. **
-
-    This operator computes the smooth L1 loss for X and Y.
-    The operator takes the first dimension of X and Y as batch size.
+    This layer computes the smooth L1 loss for Variable `x` and `y`.
+    It takes the first dimension of `x` and `y` as batch size.
     For each instance, it computes the smooth L1 loss element by element first
-    and then sums all the losses. So the shape of Out is [batch_size, 1].
+    and then sums all the losses. So the shape of ouput Variable is 
+    [batch_size, 1].
 
     Args:
         x (Variable): A tensor with rank at least 2. The input value of smooth
             L1 loss op with shape [batch_size, dim1, ..., dimN].
         y (Variable): A tensor with rank at least 2. The target value of smooth
-            L1 loss op with same shape as x.
+            L1 loss op with same shape as `x`.
         inside_weight (Variable|None):  A tensor with rank at least 2. This
-            input is optional and should have same shape with x. If provided,
-            the result of (x - y) will be multiplied by this tensor element by
+            input is optional and should have same shape with `x`. If provided,
+            the result of (`x - y`) will be multiplied by this tensor element by
             element.
         outside_weight (Variable|None): A tensor with rank at least 2. This
             input is optional and should have same shape with x. If provided,
             the out smooth L1 loss will be multiplied by this tensor element
             by element.
-        sigma (float|None): Hyper parameter of smooth L1 loss op. A float scalar
-            with default value 1.0.
+        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float 
+           scalar with default value 1.0.
+
     Returns:
-        Variable: A tensor with rank be 2. The output smooth L1 loss with
-            shape [batch_size, 1].
+        Variable: The output smooth L1 loss with shape [batch_size, 1].
 
     Examples:
         .. code-block:: python
@@ -3446,6 +3445,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
             fc = fluid.layers.fc(input=data, size=100)
             out = fluid.layers.smooth_l1(x=fc, y=label)
     """
+
     helper = LayerHelper('smooth_l1_loss', **locals())
     diff = helper.create_tmp_variable(dtype=x.dtype)
     loss = helper.create_tmp_variable(dtype=x.dtype)

From bff4cec3b3ab9c1b4cf5086d006def15cc0eaa82 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Thu, 14 Jun 2018 06:36:54 -0700
Subject: [PATCH 28/69] Format lod_reset's doc

---
 python/paddle/fluid/layers/nn.py | 39 ++++++++++++++++----------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ed2e1811f6..378a1c33c6 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3411,8 +3411,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
 
 def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
-    This layer computes the smooth L1 loss for Variable `x` and `y`.
-    It takes the first dimension of `x` and `y` as batch size.
+    This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
+    It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
     For each instance, it computes the smooth L1 loss element by element first
     and then sums all the losses. So the shape of ouput Variable is 
     [batch_size, 1].
@@ -3421,15 +3421,15 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
         x (Variable): A tensor with rank at least 2. The input value of smooth
             L1 loss op with shape [batch_size, dim1, ..., dimN].
         y (Variable): A tensor with rank at least 2. The target value of smooth
-            L1 loss op with same shape as `x`.
+            L1 loss op with same shape as :attr:`x`.
         inside_weight (Variable|None):  A tensor with rank at least 2. This
-            input is optional and should have same shape with `x`. If provided,
-            the result of (`x - y`) will be multiplied by this tensor element by
-            element.
+            input is optional and should have same shape with :attr:`x`. If 
+            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied 
+            by this tensor element by element.
         outside_weight (Variable|None): A tensor with rank at least 2. This
-            input is optional and should have same shape with x. If provided,
-            the out smooth L1 loss will be multiplied by this tensor element
-            by element.
+            input is optional and should have same shape with :attr:`x`. If 
+            provided, the out smooth L1 loss will be multiplied by this tensor 
+            element by element.
         sigma (float|None): Hyper parameter of smooth L1 loss layer. A float 
            scalar with default value 1.0.
 
@@ -3634,12 +3634,12 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
 
 def lod_reset(x, y=None, target_lod=None):
     """
-    LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or
-    **target_lod**. When **y** provided, **y.lod** would be considered as target
-    LoD first, otherwise **y.data** would be considered as target LoD. If **y**
-    is not provided, target LoD should be specified by **target_lod**.
-    If target LoD is specified by **Y.data** or **target_lod**, only one level
-    LoD is supported.
+    Set LoD of :attr:`x` to a new one specified by :attr:`y` or
+    :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be 
+    considered as target LoD first, otherwise :attr:`y.data` would be 
+    considered as target LoD. If :attr:`y` is not provided, target LoD should 
+    be specified by :attr:`target_lod`. If target LoD is specified by 
+    :attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported.
 
     .. code-block:: text
 
@@ -3692,15 +3692,16 @@ def lod_reset(x, y=None, target_lod=None):
 
     Args:
         x (Variable): Input variable which could be a Tensor or LodTensor.
-        y (Variable|None): If provided, output's LoD would be derived from y.
+        y (Variable|None): If provided, output's LoD would be derived 
+                           from :attr:`y`.
         target_lod (list|tuple|None): One level LoD which should be considered
-                                      as target LoD when y not provided.
+                                      as target LoD when :attr:`y` not provided.
 
     Returns:
-        Variable: Output variable with LoD specified by this operator.
+        Variable: Output variable with LoD specified by this layer.
 
     Raises:
-        ValueError: If y and target_lod are both None.
+        ValueError: If :attr:`y` and :attr:`target_lod` are both None.
 
     Examples:
         .. code-block:: python

From 80ccabbfa7b9777fab2369f6a2ecb852b61b0906 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Thu, 14 Jun 2018 06:56:17 -0700
Subject: [PATCH 29/69] Fix typos in reduce_mean's doc

---
 python/paddle/fluid/layers/nn.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 378a1c33c6..47dfed5468 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2243,23 +2243,24 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
 def reduce_mean(input, dim=None, keep_dim=False, name=None):
     """
-    Computes the mean of tensor elements over the given dimension.
+    Computes the mean of the input tensor's elements along the given dimension.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimensions along which the mean is computed. If
-            :attr:`None`, compute the mean over all elements of :attr:`input`
-            and return a Tensor variable with a single element, otherwise
+        dim (list|int|None): The dimension along which the mean is computed. If
+            `None`, compute the mean over all elements of :attr:`input`
+            and return a variable with a single element, otherwise it
             must be in the range :math:`[-rank(input), rank(input))`. If
-            :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
+            :math:`dim[i] < 0`, the dimension to reduce is 
+            :math:`rank(input) + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set `None`, the layer
                        will be named automatically.
 
     Returns:
-        Variable: The reduced Tensor variable.
+        Variable: The reduced mean Variable.
 
     Examples:
         .. code-block:: python

From acdb57a510d116d0c9f2a0d0b26083f474cb4f8a Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 15 Jun 2018 02:30:36 +0800
Subject: [PATCH 30/69] polish doc: conv2d

---
 python/paddle/fluid/layers/nn.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 2c1f988828..48c6bb99bb 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1183,14 +1183,17 @@ def conv2d(input,
            act=None,
            name=None):
     """
-    **Convlution2D Layer**
-
     The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are in NCHW format. Where N is batch size, C is the number of
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCHW format, where N is batch size, C is the number of
     channels, H is the height of the feature, and W is the width of the feature.
-    The details of convolution layer, please refer UFLDL's `convolution,
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    Filter is in MCHW format, where M is the number of output image channels,
+    C is the number of input image channels, H is the height of the filter,
+    and W is the width of the filter. If the groups is greater than 1,
+    C will equal the number of input image channels divided by the groups.
+    Please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
+    for more detials.
     If bias attribution and activation type are provided, bias is added to the
     output of the convolution, and the corresponding activation function is
     applied to the final result.
@@ -1201,15 +1204,14 @@ def conv2d(input,
 
         Out = \sigma (W \\ast X + b)
 
-    In the above equation:
+    Where:
 
     * :math:`X`: Input value, a tensor with NCHW format.
     * :math:`W`: Filter value, a tensor with MCHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
@@ -1220,6 +1222,7 @@ def conv2d(input,
           Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
 
         - Output:
+
           Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
@@ -1231,7 +1234,7 @@ def conv2d(input,
 
     Args:
         input (Variable): The input image with [N, C, H, W] format.
-            num_filters(int): The number of filter. It is as same as the output
+        num_filters(int): The number of filter. It is as same as the output
             image channel.
         filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
             it must contain two integers, (filter_size_H, filter_size_W).
@@ -1254,7 +1257,8 @@ def conv2d(input,
         bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
             library is installed. Default: True
-        use_mkldnn (bool): Use mkldnn kernels or not.
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
         act (str): Activation type. Default: None
         name (str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.

From 24fea628ccb224c3d8a4eadd37d5b23bc39ad1ce Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 15 Jun 2018 03:03:28 +0800
Subject: [PATCH 31/69] polish doc: mean

---
 paddle/fluid/operators/mean_op.cc | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 4881cff4a3..9e0bebd17c 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").Reuse("X");
+    AddInput("X", "(Tensor) The input of mean op");
+    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
     AddComment(R"DOC(
-Mean Operator.
-
-Out is a scalar which is the mean of all elements in X. 
+Mean Operator calculates the mean of all elements in X.
 
 )DOC");
   }

From f9bebfe43092807891b25392960db6944f072d5d Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 15 Jun 2018 04:22:33 +0800
Subject: [PATCH 32/69] polish doc: lod_rank_table, embedding

---
 python/paddle/fluid/layers/control_flow.py | 2 +-
 python/paddle/fluid/layers/nn.py           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 80e8ff484a..87843b0e96 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -706,7 +706,7 @@ def lod_rank_table(x, level=0):
         .. code-block:: python
 
             x = fluid.layers.data(name='x', shape=[10],
-                            dtype='float32', lod_level=1)
+                                  dtype='float32', lod_level=1)
             out = layers.lod_rank_table(x=x, level=0)
     """
     helper = LayerHelper("lod_rank_table", **locals())
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 48c6bb99bb..635b08635b 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -173,11 +173,11 @@ def embedding(input,
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
         is_sparse(bool): The flag indicating whether to use sparse update.
-        is_distributed (bool): Whether to run lookup table from remote parameter server.
+        is_distributed(bool): Whether to run lookup table from remote parameter server.
         padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
             Otherwise the given :attr:`padding_idx` indicates padding the output
             with zeros whenever lookup encounters it in :attr:`input`. If
-            :math:`padding_idx < 0`, the padding_idx to use in lookup is
+            :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is
             :math:`size[0] + dim`.
         param_attr(ParamAttr): Parameters for this layer
         dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc

From 98ab2b403efb475bf449317b139c2b99f94b49c8 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 15 Jun 2018 04:55:06 +0800
Subject: [PATCH 33/69] polish doc: softshrink, assign, shuffle

---
 paddle/fluid/operators/activation_op.cc | 17 ++++++++---------
 python/paddle/fluid/layers/io.py        |  3 +++
 python/paddle/fluid/layers/tensor.py    |  1 +
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index af1d85047e..4d224a1341 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -252,15 +252,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "Output of Softshrink operator");
     AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
     AddComment(R"DOC(
-Softshrink Activation Operator.
-
-$$
-out = \begin{cases} 
-    x - \lambda, \text{if } x > \lambda \\
-    x + \lambda, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+:strong:`Softshrink Activation Operator`
+
+..  math::
+    out = \begin{cases} 
+         x - \lambda, \text{if } x > \lambda \\
+         x + \lambda, \text{if } x < -\lambda \\
+         0,  \text{otherwise}
+         \end{cases}
 
 )DOC");
   }
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 9de88e2c32..fc53cd802b 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -544,6 +544,9 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
 
 
 def shuffle(reader, buffer_size):
+    """
+    Shuffle the reader.
+    """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
 
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 62b01d595a..d4e9a19d1a 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -191,6 +191,7 @@ def assign(input, output):
 
     Examples:
         .. code-block:: python
+
           out = fluid.layers.create_tensor(dtype='float32')
           hidden = fluid.layers.fc(input=data, size=10)
           fluid.layers.assign(hidden, out)

From 14efb55dcff62f84599eaf08ec2490f5019f77d3 Mon Sep 17 00:00:00 2001
From: Kexin Zhao <kexin.zhao.paddle@gmail.com>
Date: Thu, 14 Jun 2018 18:25:30 -0700
Subject: [PATCH 34/69] add author (#11499)

---
 AUTHORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/AUTHORS.md b/AUTHORS.md
index 11f227be71..8c4a113fc2 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -22,6 +22,7 @@
 | jczaja | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
 | kbinias | Krzysztof Binias |
+| kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |

From caf6914fadf31ac5e9c94c32e367481e750772f1 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Thu, 14 Jun 2018 20:18:27 +0800
Subject: [PATCH 35/69] add doc of sequence_softmax and parallelDo

---
 python/paddle/fluid/layers/control_flow.py |  5 ++--
 python/paddle/fluid/layers/nn.py           | 35 ++++++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 80e8ff484a..15ecc731e8 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -233,9 +233,8 @@ class BlockGuard(object):
 
 class ParallelDo(object):
     """
-    ParallelDo class.
-
-    ParallelDo class is used to create a ParallelDo.
+    ParallelDo class is used to create a ParallelDo. 
+    It will be soon deprecated, please use ParallelExecutor instead.
     """
 
     def __init__(self, places, use_nccl=False, name=None):
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 2c1f988828..04b23457d4 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1146,6 +1146,41 @@ def sequence_conv(input,
 
 
 def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
+    """
+    This function computes the softmax activation among all time-steps for each
+    sequence. The dimension of each time-step should be 1. Thus, the shape of
+    input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N` 
+    is the sum of the length of all sequences.
+
+    For i-th sequence in a mini-batch:
+
+    .. math::
+
+        Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))}
+
+    For example, for a mini-batch of 3 sequences with variable-length,
+    each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
+    then softmax will be computed among :math:`X[0:2, :]`, :math:`X[2:5, :]`,
+    :math:`X[5:7, :]`, and :math:`N` turns out to be 7.
+
+    Args:
+        input (Variable): The input variable which is a LoDTensor.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
+        library is installed. Default: True
+    
+    Returns:
+        Variable: output of sequence_softmax
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1],
+                              dtype='float32', lod_level=1)
+             x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
+    """
     helper = LayerHelper('sequence_softmax', **locals())
     dtype = helper.input_dtype()
     softmax_out = helper.create_tmp_variable(dtype)

From e42e6ea142d8f40cc0fa0a82079be20918733c8a Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Fri, 15 Jun 2018 10:30:03 +0800
Subject: [PATCH 36/69] add inference lib to release (#11482)

---
 cmake/inference_lib.cmake | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 236a55d332..cd44fe2542 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -39,7 +39,7 @@ function(copy TARGET)
         message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
     endif()
     math(EXPR len "${copy_lib_SRCS_len} - 1")
-    
+
     add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
@@ -155,6 +155,15 @@ copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
+if(WITH_CONTRIB)
+   set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+   copy(contrib_inference_lib DEPS paddle_inference_api
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
+        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
+        DSTS ${contrib_dst_dir} ${contrib_dst_dir}
+   )
+endif()
+
 set(module "platform")
 copy(platform_lib DEPS profiler_py_proto
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h

From 0b063e5e57f05de428f3fce194a8d5fe4629568c Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Thu, 14 Jun 2018 20:26:50 -0700
Subject: [PATCH 37/69] Fix one_hot layer's doc

---
 python/paddle/fluid/layers/nn.py | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 47dfed5468..f4a8c5f37d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3466,32 +3466,20 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
 
 def one_hot(input, depth):
     """
-    One Hot Operator. This operator creates the one-hot representations for input
-    index values. The following example will help to explain the function of this
-    operator.
+    This layer creates the one-hot representations for input indices.
 
     Args:
-        input(variable):  A Tensor/LodTensor of indices, last dimension must be 1.
-        depth(scalar): an interger defining the depth of the one hot dimension.
+        input(Variable): Input indices, last dimension must be 1.
+        depth(scalar): An interger defining the depth of the one-hot dimension.
 
     Returns:
-         The one-hot tensor or LodTensor, same as input.
+        Variable: The one-hot representations of input.
 
     Examples:
         .. code-block:: python
-
-        X is a LoDTensor:
-          X.lod = [[0, 1, 4]]
-          X.shape = [4, 1]
-          X.data = [[1], [1], [3], [0]]
-        set depth = 4
-        Out is a LoDTensor:
-          Out.lod = [[0, 1, 4]]
-          Out.shape = [4, 4]
-          Out.data = [[0., 1., 0., 0.],
-                      [0., 1., 0., 0.],
-                      [0., 0., 0., 1.],
-                      [1., 0., 0., 0.]]
+        
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
     """
     helper = LayerHelper("one_hot", **locals())
     one_hot_out = helper.create_tmp_variable(dtype='float32')

From d91060d300edf3c908f25b741adc999b065887da Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Fri, 15 Jun 2018 11:38:31 +0800
Subject: [PATCH 38/69] fix errors

---
 paddle/fluid/operators/activation_op.cc | 2 +-
 paddle/fluid/operators/pool_op.cc       | 8 ++++----
 python/paddle/fluid/layers/nn.py        | 6 +++---
 python/paddle/fluid/layers/tensor.py    | 3 ++-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index af1d85047e..790c012fdb 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -444,7 +444,7 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Swish Activation Operator.
 
-$$out = \frac{x}{1 + e^{- \beta x}}$$
+$$out = \\frac{x}{1 + e^{- \beta x}}$$
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index d94ddc7a53..f8ad63690e 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -224,17 +224,17 @@ Example:
 
   For ceil_mode = false:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
        $$
        $$
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
   For ceil_mode = true:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
        $$
        $$
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
        $$
 
 )DOC");
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1218766e8d..b073955e2f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1495,9 +1495,9 @@ def pool2d(input,
 
     Args:
         input (Variable): The input tensor of pooling operator. The format of 
-        input tensor is NCHW, where N is batch size, C is the number of 
-        channels, H is the height of the feature, and W is the width of 
-        the feature.
+                          input tensor is NCHW, where N is batch size, C is 
+                          the number of channels, H is the height of the 
+                          feature, and W is the width of the feature.
         pool_size (int): The side length of pooling windows. All pooling 
                          windows are squares with pool_size on a side.
         pool_type: ${pooling_type_comment}
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 392fa6a422..81f42ff470 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -146,7 +146,8 @@ def concat(input, axis=0, name=None):
 
     Examples:
         .. code-block:: python
-          out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
+        
+           out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
     """
     helper = LayerHelper('concat', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())

From 96e466391689f2dd0915c21f1176eae754a73a23 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Fri, 15 Jun 2018 12:22:30 +0800
Subject: [PATCH 39/69] Polish inline math and duplicable/optional in auto
 generated doc

---
 .../fluid/layers/layer_function_generator.py  | 28 +++++++++++++------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index cb60a3aec9..0f05ea2b08 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -44,6 +44,11 @@ def _type_to_str_(tp):
     return framework_pb2.AttrType.Name(tp)
 
 
+_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$")
+_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$")
+_two_bang_pattern_ = re.compile(r"!!([^!]+)!!")
+
+
 def _generate_doc_string_(op_proto):
     """
     Generate docstring by OpProto
@@ -55,22 +60,27 @@ def _generate_doc_string_(op_proto):
         str: the document string
     """
 
+    def escape_math(text):
+        return _two_bang_pattern_.sub(
+            r'$$\1$$',
+            _single_dollar_pattern_.sub(
+                r':math:`\1`', _two_dollar_pattern_.sub(r"!!\1!!", text)))
+
     if not isinstance(op_proto, framework_pb2.OpProto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
     buf = cStringIO.StringIO()
-    buf.write(op_proto.comment)
+    buf.write(escape_math(op_proto.comment))
     buf.write('\nArgs:\n')
     for each_input in op_proto.inputs:
         line_begin = '    {0}: '.format(_convert_(each_input.name))
         buf.write(line_begin)
-        buf.write(each_input.comment)
+        buf.write(escape_math(each_input.comment))
         buf.write('\n')
-        buf.write(' ' * len(line_begin))
-        buf.write('Duplicable: ')
-        buf.write(str(each_input.duplicable))
-        buf.write('  Optional: ')
-        buf.write(str(each_input.dispensable))
+        if each_input.duplicable:
+            buf.write("  Duplicatable.")
+        if each_input.dispensable:
+            buf.write("  Optional.")
         buf.write('\n')
 
     skip_attrs = OpProtoHolder.generated_op_attr_names()
@@ -83,7 +93,7 @@ def _generate_doc_string_(op_proto):
         buf.write(' (')
         buf.write(_type_to_str_(each_attr.type))
         buf.write('): ')
-        buf.write(each_attr.comment)
+        buf.write(escape_math(each_attr.comment))
         buf.write('\n')
 
     if len(op_proto.outputs) != 0:
@@ -92,7 +102,7 @@ def _generate_doc_string_(op_proto):
         for each_opt in op_proto.outputs:
             if not each_opt.intermediate:
                 break
-        buf.write(each_opt.comment)
+        buf.write(escape_math(each_opt.comment))
 
     return buf.getvalue()
 

From 34ac0eb8e613868a2b6de35a62c86a4059d72335 Mon Sep 17 00:00:00 2001
From: QI JUN <qijun1994@hotmail.com>
Date: Fri, 15 Jun 2018 13:09:25 +0800
Subject: [PATCH 40/69] enhance memory optimization transpiler to support user
 defined skip_opt_set (#11372)

* fix mac build error

* enhance memory optimize transpiler to let users set to some skip opt set of variables
---
 .../memory_optimization_transpiler.py           | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 9ff0ae6fca..8bfb554845 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -157,9 +157,11 @@ class ControlFlowGraph(object):
             if op.type() == "fill_constant" and op.attr("force_cpu") == True:
                 self._skip_opt.update(op.output_arg_names())
 
-    def release_memory(self):
+    def release_memory(self, skip_opt_set=None):
         self._dataflow_analyze()
         self._update_skip_opt_set()
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
         fwd_id = 0
         bwd_id = 0
         for i in range(self.op_size):
@@ -183,7 +185,7 @@ class ControlFlowGraph(object):
                 else:
                     bwd_id += 1
 
-    def memory_optimize(self, level=0):
+    def memory_optimize(self, skip_opt_set=None, level=0):
         def compare_shape(x_shape, cache_shape, opt_level):
             if opt_level == 0:
                 return x_shape == cache_shape
@@ -200,6 +202,9 @@ class ControlFlowGraph(object):
 
         self._dataflow_analyze()
         self._update_skip_opt_set()
+        # update skip set to meet users' demand
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
         self.pool = []
         for i in range(self.op_size):
             op = self._ops[i]
@@ -358,7 +363,7 @@ def _get_cfgs(input_program):
     return cfgs
 
 
-def memory_optimize(input_program, print_log=False, level=0):
+def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
     """Optimize memory by reusing var memory.
 
       Note: it doesn't not support subblock nested in subblock.
@@ -374,10 +379,10 @@ def memory_optimize(input_program, print_log=False, level=0):
     PRINT_LOG = print_log
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
-        cfg.memory_optimize(level)
+        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
 
 
-def release_memory(input_program):
+def release_memory(input_program, skip_opt_set=None):
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
-        cfg.release_memory()
+        cfg.release_memory(skip_opt_set=skip_opt_set)

From 1e2acd979669b14c03a916aabc23a73b69af08f6 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Fri, 15 Jun 2018 13:46:03 +0800
Subject: [PATCH 41/69] refine ParallelDo doc

---
 python/paddle/fluid/layers/control_flow.py | 52 +++++++++++++++++++++-
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 9c3b5cfda5..52276df3bf 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -234,8 +234,56 @@ class BlockGuard(object):
 
 class ParallelDo(object):
     """
-    ParallelDo class is used to create a ParallelDo. 
-    It will be soon deprecated, please use ParallelExecutor instead.
+    ParallelDo is used to represent multi-thread data parallel processing.
+
+    Its vanilla implementation can be shown as the following (:math:`|` means 
+    single thread and :math:`||||` means multiple threads)
+
+    .. code-block:: text
+
+      In the forward pass
+        |      Split input onto different devices
+        |      Copy parameter onto different devices
+        ||||   Compute forward pass in parallel
+        |      Merge output from different devices
+
+      In the backward pass
+        |      Split output@grad onto different devices
+        ||||   Compute backward pass in parallel
+        |      accumulate param@grad from different devices to the first device
+        |      Merge input@grad from different devices
+        |      Copy param@grad to the place of parallel_do_op
+
+    Examples:
+
+    .. code-block:: python
+
+      images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+      label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+      # ParallelDo version & Single-thread version
+      if thread_num > 1:
+          places = fluid.layers.get_places(thread_num)
+          pd = fluid.layers.ParallelDo(places)
+          with pd.do():
+              images = pd.read_input(images)
+              label = pd.read_input(label)
+              predict = cnn_model(images)
+              cost = fluid.layers.cross_entropy(input=predict, label=label)
+
+              avg_cost = fluid.layers.mean(x=cost)
+              pd.write_output(avg_cost)
+
+          avg_cost = pd()
+          avg_cost = fluid.layers.mean(avg_cost)
+      else:
+          predict = cnn_model(images)
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
+          avg_cost = fluid.layers.mean(x=cost)
+
+    .. warning::
+    
+       It will be soon deprecated, please use ParallelExecutor instead.
     """
 
     def __init__(self, places, use_nccl=False, name=None):

From 5fd142c3fd5cd673802593befd0f27a2257134f4 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Fri, 15 Jun 2018 13:48:57 +0800
Subject: [PATCH 42/69] bugfix/trt engine op (#11487)

---
 .../inference/tensorrt/convert/op_converter.h |  3 +-
 paddle/fluid/inference/tensorrt/engine.h      | 32 ++++--
 paddle/fluid/operators/tensorrt_engine_op.cc  | 28 ++++--
 paddle/fluid/operators/tensorrt_engine_op.h   | 33 ++++---
 .../operators/tensorrt_engine_op_test.cc      | 99 ++++++++++++++++++-
 5 files changed, 158 insertions(+), 37 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index c7a5a49dd0..6697952051 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -64,7 +64,8 @@ class OpConverter {
     (*it)(op, scope, test_mode);
   }
 
-  // convert fluid block to tensorrt network
+  // Convert a fluid block to tensorrt network, NOTE it just convert operators,
+  // the INetwork's inputs and outputs should specified in some other modules.
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
                     const framework::Scope& scope, TensorRTEngine* engine) {
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index b60f00de9f..b06a9bbc67 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
     nvinfer1::Weights w_;
   };
 
-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
+  TensorRTEngine(int max_batch, int max_workspace,
+                 cudaStream_t* stream = nullptr,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        stream_(stream),
+        stream_(stream ? stream : &default_stream_),
         logger_(logger) {}
 
   virtual ~TensorRTEngine();
@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
   // the max memory size the engine uses
   int max_workspace_;
   cudaStream_t* stream_;
+  // If stream_ is not set from outside, hold its own stream.
+  cudaStream_t default_stream_;
   nvinfer1::ILogger& logger_;
 
   std::vector<Buffer> buffers_;
@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
  */
 class TRT_EngineManager {
  public:
-  TensorRTEngine* Create(int max_batch, int max_workspace,
-                         cudaStream_t* stream) {
-    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
-    return engines_.back().get();
+  bool HasEngine(const std::string& name) const {
+    return engines_.count(name) != 0;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create or get an engine called `name`
+  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
+                         const std::string& name) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+    engines_[name].reset(p);
+    return p;
   }
 
   void DeleteALl() {
-    for (auto& ptr : engines_) {
-      ptr.reset(nullptr);
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
     }
   }
 
  private:
-  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 4b1208c437..0ea273af9d 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
 }  // namespace
 
 template <typename DeviceContext, typename T>
-void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
+void TensorRTEngineKernel<DeviceContext, T>::Prepare(
     const framework::ExecutionContext &context) const {
   VLOG(4) << "Prepare engine";
   // Get the ProgramDesc and pass to convert.
   framework::proto::BlockDesc block_desc;
   block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-  max_batch_ = context.Attr<int>("max_batch");
+  int max_batch = context.Attr<int>("max_batch");
   auto max_workspace = context.Attr<int>("max_workspace");
-  engine_ = Singleton<TRT_EngineManager>::Global().Create(
-      max_batch_, max_workspace, &stream_);
-  engine_->InitNetwork();
+  auto params = context.Attr<std::vector<std::string>>("parameters");
+  std::unordered_set<std::string> parameters;
+  for (const auto &param : params) {
+    parameters.insert(param);
+  }
+
+  // TODO(Superjomn) replace this with a different stream
+  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+      context.Attr<std::string>("engine_uniq_key"));
+  engine->InitNetwork();
 
   framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
   // Add inputs
@@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
     PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                       "TensorRT engine only takes LoDTensor as input");
     auto shape = var->GetShape();
-    engine_->DeclareInput(
+    engine->DeclareInput(
         input, FluidDataType2TRT(
                    var->Proto()->type().lod_tensor().tensor().data_type()),
         Vec2TRT_Dims(var->GetShape()));
   }
 
-  // TODO(Superjomn) parameters should be passed after analysised from outside.
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block_desc, {}, context.scope(), engine_);
+      block_desc, parameters, context.scope(), engine);
 
   // Add outputs
   VLOG(4) << "declare outputs";
   for (auto &output : context.Outputs("Ys")) {
     VLOG(4) << "declare output " << output;
-    engine_->DeclareOutput(output);
+    engine->DeclareOutput(output);
   }
 
-  engine_->FreezeNetwork();
+  engine->FreezeNetwork();
 }
 
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
     AddAttr<int>("max_batch", "the maximum batch size.");
     AddAttr<int>("max_workspace", "the maximum batch size.");
     AddComment("TensorRT engine operator.");
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 4b089601ff..8455d24ddf 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -19,10 +19,14 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
 
 namespace paddle {
 namespace operators {
 
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
 class TensorRTEngineOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -47,16 +51,18 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    if (!engine_) {
+    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
       Prepare(context);
     }
+    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
     // Try to determine a batch_size
     auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
         context.scope(), input_names.front());
     int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, max_batch_);
+    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
 
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
@@ -64,20 +70,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
           context.scope(), x);
       if (platform::is_cpu_place(t.place())) {
-        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
       } else {
-        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
       }
     }
     // Execute the engine.
     PADDLE_ENFORCE_GT(batch_size, 0);
-    engine_->Execute(batch_size);
+    engine->Execute(batch_size);
     // Convert output tensor from engine to fluid
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
       if (platform::is_cpu_place(fluid_t->place())) {
         // TODO(Superjomn) change this float to dtype size.
-        engine_->GetOutputInCPU(
+        engine->GetOutputInCPU(
             y, fluid_t->mutable_data<float>(platform::CPUPlace()),
             size * sizeof(float));
       } else {
-        engine_->GetOutputInGPU(
+        engine->GetOutputInGPU(
             y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
             size * sizeof(float));
       }
     }
 
-    cudaStreamSynchronize(stream_);
+    cudaStreamSynchronize(*engine->stream());
   }
 
  protected:
   // Build the engine.
   void Prepare(const framework::ExecutionContext& context) const;
-
- private:
-  mutable cudaStream_t stream_;
-  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
-  mutable int max_batch_{0};
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 6f383de259..85330958cd 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -79,6 +79,17 @@ void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
   attr->set_type(paddle::framework::proto::AttrType::LONG);
   attr->set_l(data);
 }
+template <>
+void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
+                                       const std::string& name,
+                                       const std::vector<std::string>& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
+  for (const auto& s : data) {
+    attr->add_strings(s.c_str());
+  }
+}
 
 }  // namespace
 
@@ -123,11 +134,15 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
   SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
+                                    std::vector<std::string>({}));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+  LOG(INFO) << "engine_op " << engine_op.get();
 
   framework::Scope scope;
   platform::CPUPlace place;
@@ -145,6 +160,88 @@ TEST(TensorRTEngineOp, manual) {
   engine_op->Run(scope, place);
 }
 
+void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  using shape_t = std::vector<int64_t>;
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+
+  auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
+                        const std::string& z_name, bool x_created,
+                        const shape_t& x_shape, const shape_t& y_shape,
+                        const shape_t& z_shape) {
+
+    LOG(INFO) << "create fc op";
+    auto* fc = block_desc.AppendOp();
+    fc->SetType("mul");
+    fc->SetInput("X", std::vector<std::string>({x_name}));
+    fc->SetInput("Y", std::vector<std::string>({y_name}));
+    fc->SetOutput("Out", std::vector<std::string>({z_name}));
+
+    // Set inputs' variable shape in BlockDesc
+    if (!x_created) {
+      AddTensorToBlockDesc(block_, x_name,
+                           std::vector<int64_t>({batch_size, input_dim, 1, 1}));
+    }
+    AddTensorToBlockDesc(block_, y_name,
+                         std::vector<int64_t>({input_dim, output_dim}));
+    AddTensorToBlockDesc(block_, z_name,
+                         std::vector<int64_t>({batch_size, output_dim}));
+
+    // Prepare variables.
+    if (!x_created) {
+      CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape));
+    }
+    CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape));
+    CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape));
+
+    // It is wired, need to copy manually.
+    *block_->add_ops() = *fc->Proto();
+  };
+
+  // Test with 4 layer FC
+  AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
+             {input_dim, output_dim}, {batch_size, output_dim});
+  AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
+
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<std::vector<std::string>>(
+      engine_op_desc.Proto(), "parameters",
+      std::vector<std::string>({"y0", "y1", "y2", "y3"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
+
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  // Execute them.
+  engine_op->Run(scope, place);
+}
+
+// Test with a larger FC layer.
+TEST(TensorRTEngineOp, fc) { Execute(40, 256, 256); }
+
 }  // namespace operators
 }  // namespace paddle
 

From fd87c0e709227f45ca13562a9fb7aa0f56f3efb9 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Thu, 14 Jun 2018 23:15:22 -0700
Subject: [PATCH 43/69] Fix cast layer's doc

---
 python/paddle/fluid/layers/tensor.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 04efc40af5..91a3915367 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -111,8 +111,21 @@ def create_global_var(shape,
 
 def cast(x, dtype):
     """
-    This function takes in the input with input_dtype
-    and casts it to the output_dtype as the output.
+    This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts 
+    it to the output with :attr:`dtype`.
+
+    Args:
+        x (Variable): The input Variable for casting.
+        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Variable.
+
+    Returns:
+        Variable: The output Variable after casting.
+
+    Examples:
+        .. code-block:: python
+             
+            data = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            result = fluid.layers.cast(x=data, dtype='float64')
     """
     helper = LayerHelper('cast', **locals())
     out = helper.create_tmp_variable(dtype=dtype)

From 1958654d6f15087c28b44759c1a8d004826f00ce Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Fri, 15 Jun 2018 14:28:17 +0800
Subject: [PATCH 44/69] refine \odot in elementwise_mul

---
 paddle/fluid/operators/elementwise_mul_op.cc  |  2 +-
 .../fluid/layers/layer_function_generator.py  | 28 +++++++++++++------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
index ba343909bb..7cd67e74de 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_mul_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\odot\\ Y");
+REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index cb60a3aec9..0f05ea2b08 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -44,6 +44,11 @@ def _type_to_str_(tp):
     return framework_pb2.AttrType.Name(tp)
 
 
+_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$")
+_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$")
+_two_bang_pattern_ = re.compile(r"!!([^!]+)!!")
+
+
 def _generate_doc_string_(op_proto):
     """
     Generate docstring by OpProto
@@ -55,22 +60,27 @@ def _generate_doc_string_(op_proto):
         str: the document string
     """
 
+    def escape_math(text):
+        return _two_bang_pattern_.sub(
+            r'$$\1$$',
+            _single_dollar_pattern_.sub(
+                r':math:`\1`', _two_dollar_pattern_.sub(r"!!\1!!", text)))
+
     if not isinstance(op_proto, framework_pb2.OpProto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
     buf = cStringIO.StringIO()
-    buf.write(op_proto.comment)
+    buf.write(escape_math(op_proto.comment))
     buf.write('\nArgs:\n')
     for each_input in op_proto.inputs:
         line_begin = '    {0}: '.format(_convert_(each_input.name))
         buf.write(line_begin)
-        buf.write(each_input.comment)
+        buf.write(escape_math(each_input.comment))
         buf.write('\n')
-        buf.write(' ' * len(line_begin))
-        buf.write('Duplicable: ')
-        buf.write(str(each_input.duplicable))
-        buf.write('  Optional: ')
-        buf.write(str(each_input.dispensable))
+        if each_input.duplicable:
+            buf.write("  Duplicatable.")
+        if each_input.dispensable:
+            buf.write("  Optional.")
         buf.write('\n')
 
     skip_attrs = OpProtoHolder.generated_op_attr_names()
@@ -83,7 +93,7 @@ def _generate_doc_string_(op_proto):
         buf.write(' (')
         buf.write(_type_to_str_(each_attr.type))
         buf.write('): ')
-        buf.write(each_attr.comment)
+        buf.write(escape_math(each_attr.comment))
         buf.write('\n')
 
     if len(op_proto.outputs) != 0:
@@ -92,7 +102,7 @@ def _generate_doc_string_(op_proto):
         for each_opt in op_proto.outputs:
             if not each_opt.intermediate:
                 break
-        buf.write(each_opt.comment)
+        buf.write(escape_math(each_opt.comment))
 
     return buf.getvalue()
 

From 3571df8755b5e5566360ad07fd2682f1f031454a Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Fri, 15 Jun 2018 14:35:40 +0800
Subject: [PATCH 45/69] Remove unused '\n' in comments

---
 python/paddle/fluid/layers/layer_function_generator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 0f05ea2b08..7a95afa9a6 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -76,7 +76,6 @@ def _generate_doc_string_(op_proto):
         line_begin = '    {0}: '.format(_convert_(each_input.name))
         buf.write(line_begin)
         buf.write(escape_math(each_input.comment))
-        buf.write('\n')
         if each_input.duplicable:
             buf.write("  Duplicatable.")
         if each_input.dispensable:

From 279ebdd0b2ab40172d913a3eb1d051a41f24ddb7 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Thu, 14 Jun 2018 23:40:52 -0700
Subject: [PATCH 46/69] Fix  reciprocal op's doc

---
 paddle/fluid/operators/activation_op.cc | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index c73482eb12..c51e63982c 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -19,18 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)               \
-  class OP_NAME##OpMaker                                                \
-      : public ::paddle::framework::OpProtoAndCheckerMaker {            \
-   public:                                                              \
-    void Make() override {                                              \
-      AddInput("X", "Input of " #OP_NAME " operator");                  \
-      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");   \
-      AddAttr<bool>("use_mkldnn",                                       \
-                    "(bool, default false) Only used in mkldnn kernel") \
-          .SetDefault(false);                                           \
-      AddComment(OP_COMMENT);                                           \
-    }                                                                   \
+#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)             \
+  class OP_NAME##OpMaker                                              \
+      : public ::paddle::framework::OpProtoAndCheckerMaker {          \
+   public:                                                            \
+    void Make() override {                                            \
+      AddInput("X", "Input of " #OP_NAME " operator");                \
+      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \
+      AddAttr<bool>("use_mkldnn",                                     \
+                    "(default false) Only used in mkldnn kernel")     \
+          .SetDefault(false);                                         \
+      AddComment(OP_COMMENT);                                         \
+    }                                                                 \
   }
 
 #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \

From bc46c527768f03ab06701d9cd48b877ddc04957d Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 15 Jun 2018 15:17:45 +0800
Subject: [PATCH 47/69] Add doc for while op

---
 python/paddle/fluid/layers/control_flow.py | 54 ++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 4fec682507..29713dcea9 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -607,6 +607,60 @@ class WhileGuard(BlockGuard):
 
 
 class While(object):
+    """
+    while loop control flow.
+
+    Args:
+        cond (Variable): condition used to compare.
+        name (str): The name of this layer.
+
+    Examples:
+          .. code-block:: python
+
+            # The value these d0, d1 and d2 can be fed from python.
+            d0 = fluid.layers.data(
+                "d0", shape=[10], append_batch_size=False, dtype='float32')
+            d1 = fluid.layers.data(
+                "d1", shape=[10], append_batch_size=False, dtype='float32')
+            d2 = fluid.layers.data(
+                "d2", shape=[10], append_batch_size=False, dtype='float32')
+            i = fluid.layers.zeros(shape=[1], dtype='int64')
+            i.stop_gradient = True
+            init = fluid.layers.zeros(shape=[10], dtype='float32')
+            # Initialize mem_array from init
+            mem_array = fluid.layers.array_write(x=init, i=i)
+            # Initialize data_array from d0
+            data_array = fluid.layers.array_write(x=d0, i=i)
+            # Set a value to data_array using d1[i].
+            i = fluid.layers.increment(i)
+            fluid.layers.array_write(d1, i, array=data_array)
+            # Set a value to data_array using d2[i].
+            i = fluid.layers.increment(i)
+            fluid.layers.array_write(d2, i, array=data_array)
+            # Create a idx to start the while loop.
+            i = fluid.layers.zeros(shape=[1], dtype='int64')
+            i.stop_gradient = True
+
+            array_len = fluid.layers.fill_constant(
+                shape=[1], dtype='int64', value=3)
+            array_len.stop_gradient = True
+            # Create the while loop condition.
+            cond = fluid.layers.less_than(x=i, y=array_len)
+
+            # Within the loop, perform sums.
+            while_op = fluid.layers.While(cond=cond)
+            with while_op.block():
+                d = fluid.layers.array_read(array=data_array, i=i)
+                prev = fluid.layers.array_read(array=mem_array, i=i)
+                result = fluid.layers.sums(input=[d, prev])
+
+                i = fluid.layers.increment(x=i, in_place=True)
+                fluid.layers.array_write(result, i=i, array=mem_array)
+                fluid.layers.less_than(x=i, y=array_len, cond=cond)
+
+            sum_result = fluid.layers.array_read(array=mem_array, i=i)
+    """
+
     BEFORE_WHILE_BLOCK = 0
     IN_WHILE_BLOCK = 1
     AFTER_WHILE_BLOCK = 2

From 35f64cb905a1ceff5087a1b3f785136d586008e3 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Fri, 15 Jun 2018 15:21:20 +0800
Subject: [PATCH 48/69] skip all tests in tests_parallel_executor_crf

---
 .../fluid/tests/unittests/test_parallel_executor_crf.py       | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index 163975555e..1ea7a6a568 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -173,6 +173,7 @@ class TestCRFModel(unittest.TestCase):
                           pe.run(feed=feeder.feed(cur_batch),
                                  fetch_list=[avg_cost.name]))[0]
 
+    @unittest.skip(reason="CI hangs")
     def test_update_sparse_parameter_all_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
@@ -181,6 +182,7 @@ class TestCRFModel(unittest.TestCase):
         self.check_network_convergence(
             is_sparse=True, build_strategy=build_strategy, use_cuda=False)
 
+    @unittest.skip(reason="CI hangs")
     def test_update_dense_parameter_all_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
@@ -189,6 +191,7 @@ class TestCRFModel(unittest.TestCase):
         self.check_network_convergence(
             is_sparse=False, build_strategy=build_strategy, use_cuda=False)
 
+    @unittest.skip(reason="CI hangs")
     def test_update_sparse_parameter_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
@@ -197,6 +200,7 @@ class TestCRFModel(unittest.TestCase):
         self.check_network_convergence(
             is_sparse=True, build_strategy=build_strategy, use_cuda=False)
 
+    @unittest.skip(reason="CI hangs")
     def test_update_dense_parameter_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

From 67dc5c7f8aab84df120374d0f8671a56127f3e52 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Fri, 15 Jun 2018 00:29:37 -0700
Subject: [PATCH 49/69] Polish the doc of nce layer

---
 paddle/fluid/operators/nce_op.cc |  6 ++++--
 python/paddle/fluid/layers/nn.py | 28 +++++++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 06092e680a..e471f04662 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -128,8 +128,10 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
                               "user should avoid setting this attribute.")
         .SetDefault({});
     AddComment(R"DOC(
-Compute and return the noise-contrastive estimation training loss.
-See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+Compute and return the noise-contrastive estimation training loss. See 
+`Noise-contrastive estimation: A new estimation principle for unnormalized 
+statistical models 
+ <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
 By default this operator uses a uniform distribution for sampling.
 )DOC");
   }
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 27fbb0f053..0a45098bda 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3472,7 +3472,33 @@ def nce(input,
         num_neg_samples (int): ${num_neg_samples_comment}
     
     Returns:
-        Variable: output of nce layer.
+        Variable: The output nce loss.
+
+    Examples:
+        .. code-block:: python
+
+            window_size = 5
+            words = []
+            for i in xrange(window_size):
+                words.append(layers.data(
+                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+
+            dict_size = 10000
+            label_word = int(window_size / 2) + 1
+
+            embs = []
+            for i in xrange(window_size):
+                if i == label_word:
+                    continue
+
+                emb = layers.embedding(input=words[i], size=[dict_size, 32],
+                                       param_attr='emb.w', is_sparse=True)
+                embs.append(emb)
+
+            embs = layers.concat(input=embs, axis=1)
+            loss = layers.nce(input=embs, label=words[label_word],
+                          num_total_classes=dict_size, param_attr='nce.w',
+                          bias_attr='nce.b')
     """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)

From b00fbd962c0bff30307bcc833761e6a0e2ca075a Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Fri, 15 Jun 2018 15:30:04 +0800
Subject: [PATCH 50/69] fix error

---
 python/paddle/fluid/layers/io.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index e397515ff6..b004312d2d 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -637,10 +637,10 @@ def read_file(reader):
     out = [
         helper.create_tmp_variable(
             stop_gradient=True, dtype='float32')
-        for _ in range(len(file_obj.desc.shapes()))
+        for _ in range(len(reader.desc.shapes()))
     ]
     helper.append_op(
-        type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out})
+        type='read', inputs={'Reader': [reader]}, outputs={'Out': out})
     if len(out) == 1:
         return out[0]
     else:

From 7b82353010976533ad10df80637fd88b4d26c627 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Fri, 15 Jun 2018 15:06:57 +0800
Subject: [PATCH 51/69] fix conv3d/conv3d_trans/slice/mean_iou doc

---
 paddle/fluid/operators/slice_op.cc | 37 ++++++++-------
 python/paddle/fluid/layers/nn.py   | 72 +++++++++++++++---------------
 2 files changed, 56 insertions(+), 53 deletions(-)

diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index 61bb445e8b..4bd23d5941 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -95,23 +95,26 @@ of that dimension. If the value passed to start or end is larger than
 the n (the number of elements in this dimension), it represents n. 
 For slicing to the end of a dimension with unknown size, it is recommended 
 to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1].
-
-    Example 1:
-    Given:
-        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-        axes = [0, 1]
-        starts = [1, 0]
-        ends = [2, 3]
-    Then:
-        result = [ [5, 6, 7], ]
-
-    Example 2:
-    Given:
-        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-        starts = [0, 1]
-        ends = [-1, 1000]
-    Then:
-        result = [ [2, 3, 4], ]
+Following examples will explain how slice works:
+
+    .. code-block:: text
+
+        Cast1:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [1, 0]
+                ends = [2, 3]
+            Then:
+                result = [ [5, 6, 7], ]
+
+        Cast2:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                starts = [0, 1]
+                ends = [-1, 1000]
+            Then:
+                result = [ [2, 3, 4], ]
 )DOC");
   }
 };
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5e162a4ae0..d1985efc58 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1326,10 +1326,8 @@ def conv2d(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.conv2d(
-              input=data, num_filters=2, filter_size=3, act="relu")
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
     """
 
     num_channels = input.shape[1]
@@ -1431,8 +1429,7 @@ def conv3d(input,
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
@@ -1494,10 +1491,8 @@ def conv3d(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 12, 32, 32], dtype='float32')
-          conv2d = fluid.layers.conv3d(
-              input=data, num_filters=2, filter_size=3, act="relu")
+          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu")
     """
 
     l_type = 'conv3d'
@@ -2105,32 +2100,36 @@ def conv2d_transpose(input,
     represent height and width, respectively. The details of convolution transpose
     layer, please refer to the following explanation and references
     `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
 
     For each input :math:`X`, the equation is:
 
     .. math::
 
-        Out = W \\ast X
+        Out = \sigma (W \\ast X + b)
 
-    In the above equation:
+    Where:
 
     * :math:`X`: Input value, a tensor with NCHW format.
     * :math:`W`: Filter value, a tensor with MCHW format.
-    * :math:`\\ast` : Convolution transpose operation.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
         - Input:
 
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
+          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
 
         - Output:
 
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
 
@@ -2184,10 +2183,8 @@ def conv2d_transpose(input,
     Examples:
        .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d_transpose = fluid.layers.conv2d_transpose(
-              input=data, num_filters=2, filter_size=3)
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
     """
     helper = LayerHelper("conv2d_transpose", **locals())
     if not isinstance(input, Variable):
@@ -2267,32 +2264,36 @@ def conv3d_transpose(input,
     two elements. These two elements represent height and width, respectively.
     The details of convolution transpose layer, please refer to the following
     explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
 
     For each input :math:`X`, the equation is:
 
     .. math::
 
-        Out = W \\ast X
+        Out = \sigma (W \\ast X + b)
 
     In the above equation:
 
     * :math:`X`: Input value, a tensor with NCDHW format.
     * :math:`W`: Filter value, a tensor with MCDHW format.
-    * :math:`\\ast` : Convolution transpose operation.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
         - Input:
 
-          Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
+          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
 
         - Output:
 
-          Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
 
         Where
 
@@ -2347,10 +2348,8 @@ def conv3d_transpose(input,
     Examples:
        .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 12, 32, 32], dtype='float32')
-          conv2d_transpose = fluid.layers.conv3d_transpose(
-              input=data, num_filters=2, filter_size=3)
+          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3)
     """
     l_type = "conv3d_transpose"
     helper = LayerHelper(l_type, **locals())
@@ -4680,8 +4679,8 @@ def mean_iou(input, label, num_classes):
     IOU is defined as follows: 
     
     .. math::
-        
-        IOU = true_positive / (true_positive + false_positive + false_negative). 
+
+        IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
 
     The predictions are accumulated in a confusion matrix and mean-IOU 
     is then calculated from it.
@@ -4689,8 +4688,9 @@ def mean_iou(input, label, num_classes):
 
     Args:
         input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
-        label (Variable):  A Tensor of ground truth labels with type int32 or int64. 
+        label (Variable): A Tensor of ground truth labels with type int32 or int64.
                            Its shape should be the same as input.
+        num_classes (int): The possible number of labels.
 
     Returns:
         mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].

From 11f31d1e828090b91d31dc75765d50c09571e764 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 15 Jun 2018 15:52:21 +0800
Subject: [PATCH 52/69] follow comments

---
 python/paddle/fluid/layers/control_flow.py | 51 +++++-----------------
 1 file changed, 10 insertions(+), 41 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 29713dcea9..128eab7721 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -617,48 +617,17 @@ class While(object):
     Examples:
           .. code-block:: python
 
-            # The value these d0, d1 and d2 can be fed from python.
-            d0 = fluid.layers.data(
-                "d0", shape=[10], append_batch_size=False, dtype='float32')
-            d1 = fluid.layers.data(
-                "d1", shape=[10], append_batch_size=False, dtype='float32')
-            d2 = fluid.layers.data(
-                "d2", shape=[10], append_batch_size=False, dtype='float32')
-            i = fluid.layers.zeros(shape=[1], dtype='int64')
-            i.stop_gradient = True
-            init = fluid.layers.zeros(shape=[10], dtype='float32')
-            # Initialize mem_array from init
-            mem_array = fluid.layers.array_write(x=init, i=i)
-            # Initialize data_array from d0
-            data_array = fluid.layers.array_write(x=d0, i=i)
-            # Set a value to data_array using d1[i].
-            i = fluid.layers.increment(i)
-            fluid.layers.array_write(d1, i, array=data_array)
-            # Set a value to data_array using d2[i].
-            i = fluid.layers.increment(i)
-            fluid.layers.array_write(d2, i, array=data_array)
-            # Create a idx to start the while loop.
-            i = fluid.layers.zeros(shape=[1], dtype='int64')
-            i.stop_gradient = True
-
-            array_len = fluid.layers.fill_constant(
-                shape=[1], dtype='int64', value=3)
-            array_len.stop_gradient = True
-            # Create the while loop condition.
-            cond = fluid.layers.less_than(x=i, y=array_len)
-
-            # Within the loop, perform sums.
-            while_op = fluid.layers.While(cond=cond)
-            with while_op.block():
-                d = fluid.layers.array_read(array=data_array, i=i)
-                prev = fluid.layers.array_read(array=mem_array, i=i)
-                result = fluid.layers.sums(input=[d, prev])
-
-                i = fluid.layers.increment(x=i, in_place=True)
-                fluid.layers.array_write(result, i=i, array=mem_array)
-                fluid.layers.less_than(x=i, y=array_len, cond=cond)
+            d0 = layers.data("d0", shape=[10], dtype='float32')
+            data_array = layers.array_write(x=d0, i=i)
+            array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
 
-            sum_result = fluid.layers.array_read(array=mem_array, i=i)
+            cond = layers.less_than(x=i, y=array_len)
+            while_op = layers.While(cond=cond)
+            with while_op.block():
+                d = layers.array_read(array=data_array, i=i)
+                i = layers.increment(x=i, in_place=True)
+                layers.array_write(result, i=i, array=d)
+                layers.less_than(x=i, y=array_len, cond=cond)
     """
 
     BEFORE_WHILE_BLOCK = 0

From cc1239ffc97e6a5484dd323cfa926fbac9932b4e Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Fri, 15 Jun 2018 16:27:00 +0800
Subject: [PATCH 53/69] Update some doc about API reference. (#11495)

* Update some doc about layers' API.

* Fix format.

* Fix example bug in random_data_generator.

* Fix example bug in dropout.

* Follow comments and some small fix for some examples.
---
 paddle/fluid/operators/activation_op.cc       |  2 +-
 .../fluid/operators/detection/box_coder_op.cc | 41 ++++++----
 .../gaussian_random_batch_size_like_op.cc     |  9 ++-
 python/paddle/fluid/layers/io.py              | 16 ++--
 python/paddle/fluid/layers/nn.py              | 75 +++++++++++--------
 python/paddle/fluid/layers/tensor.py          | 20 ++++-
 6 files changed, 104 insertions(+), 59 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 93d1ce7179..bc03ec2f0c 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -112,7 +112,7 @@ $$out = \frac{1}{1 + e^{-x}}$$
 __attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator
 
-$$out = \log \frac{1}{1 + e^{-x}}$$
+$$out = \\log \\frac{1}{1 + e^{-x}}$$
 
 )DOC";
 
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
index 8c4b4321b7..d0f95f727f 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -106,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
               "and M represents the number of deocded boxes.");
 
     AddComment(R"DOC(
-Bounding Box Coder Operator.
+
+Bounding Box Coder.
+
 Encode/Decode the target bounding box with the priorbox information.
+
 The Encoding schema described below:
-ox = (tx - px) / pw / pxv
-oy = (ty - py) / ph / pyv
-ow = log(abs(tw / pw)) / pwv 
-oh = log(abs(th / ph)) / phv 
+
+    ox = (tx - px) / pw / pxv
+
+    oy = (ty - py) / ph / pyv
+
+    ow = log(abs(tw / pw)) / pwv 
+
+    oh = log(abs(th / ph)) / phv 
+
 The Decoding schema described below:
-ox = (pw * pxv * tx * + px) - tw / 2
-oy = (ph * pyv * ty * + py) - th / 2
-ow = exp(pwv * tw) * pw + tw / 2
-oh = exp(phv * th) * ph + th / 2
-where tx, ty, tw, th denote the target box's center coordinates, width and
-height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
-center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
-of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
-width and height.
+
+    ox = (pw * pxv * tx * + px) - tw / 2
+
+    oy = (ph * pyv * ty * + py) - th / 2
+
+    ow = exp(pwv * tw) * pw + tw / 2
+
+    oh = exp(phv * th) * ph + th / 2
+
+where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
+and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
+priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
+`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
+encoded/decoded coordinates, width and height.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
index 8050f61d45..4a97428148 100644
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
   void Apply() override {
     AddAttr<float>("mean",
                    "(float, default 0.0) "
-                   "mean of random tensor.")
+                   "The mean (or center) of the gaussian distribution.")
         .SetDefault(.0f);
     AddAttr<float>("std",
                    "(float, default 1.0) "
-                   "std of random tensor.")
+                   "The standard deviation (std, or spread) of the "
+                   "gaussian distribution.")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
                  "(int, default 0) "
@@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
         .SetDefault(framework::proto::VarType::FP32);
 
     AddComment(R"DOC(
-GaussianRandom Operator.
 
 Used to initialize tensors with gaussian random generator.
+The defalut mean of the distribution is 0. and defalut standard
+deviation (std) of the distribution is 1.. Uers can set mean and std
+by input arguments.
 )DOC");
   }
 };
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index aaf3ff671a..5dc18c633e 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -378,16 +378,16 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
        Variable: A Reader Variable from which we can get random data.
 
     Examples:
-       .. code-block:: python
 
-         reader = fluid.layers.io.random_data_generator(
-                                          low=0.0,
-                                          high=1.0,
-                                          shapes=[(3,224,224), (1)],
-                                          lod_levels=[0, 0])
+        .. code-block:: python
 
-         # Via the reader, we can use 'read_file' layer to get data:
-         image, label = fluid.layers.io.read_file(reader)
+            reader = fluid.layers.random_data_generator(
+                                             low=0.0,
+                                             high=1.0,
+                                             shapes=[[3,224,224], [1]],
+                                             lod_levels=[0, 0])
+            # Via the reader, we can use 'read_file' layer to get data:
+            image, label = fluid.layers.read_file(reader)
     """
     dtypes = [core.VarDesc.VarType.FP32] * len(shapes)
     shape_concat = []
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5e162a4ae0..3816098383 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -364,8 +364,7 @@ def dynamic_lstm(input,
         cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                               "tanh", "relu", "identity"], default "tanh".
         candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh",
-                                  "relu", "identity"],
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
@@ -540,27 +539,31 @@ def dynamic_lstmp(input,
         cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                               "tanh", "relu", "identity"], default "tanh".
         candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh",
-                                  "relu", "identity"],
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
                               default "tanh".
         proj_activation(str): The activation for projection output.
-                              Choices = ["sigmoid", "tanh",
-                                  "relu", "identity"],
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
 
     Returns:
-        tuple: The projection of hidden state, and cell state of LSTMP. The \
-               shape of projection is (T x P), for the cell state which is \
-               (T x D), and both LoD is the same with the `input`.
+        tuple: A tuple of two output variable: the projection of hidden state, \
+               and cell state of LSTMP. The shape of projection is (T x P), \
+               for the cell state which is (T x D), and both LoD is the same \
+               with the `input`.
 
     Examples:
+
         .. code-block:: python
 
+            dict_dim, emb_dim = 128, 64
+            data = fluid.layers.data(name='sequence', shape=[1],
+                                     dtype='int32', lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim, proj_dim = 512, 256
-            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+            fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                                      act=None, bias_attr=None)
             proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
                                                      size=hidden_dim * 4,
@@ -626,10 +629,10 @@ def dynamic_gru(input,
                 candidate_activation='tanh',
                 h_0=None):
     """
-    **Dynamic GRU Layer**
+    **Gated Recurrent Unit (GRU) Layer**
 
     Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
-    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_
+    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ .
 
     The formula is as follows:
 
@@ -676,17 +679,25 @@ def dynamic_gru(input,
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
         candidate_activation(str): The activation for candidate hidden state.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
-        h_0 (Variable): The hidden output of the first time step.
+        h_0 (Variable): This is initial hidden state. If not set, default is
+            zero. This is a tensor with shape (N x D), where N is the number of
+            total time steps of input mini-batch feature and D is the hidden
+            size.
 
     Returns:
         Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
-            and lod is the same with the input.
+            and sequence length is the same with the input.
 
     Examples:
+
         .. code-block:: python
 
+            dict_dim, emb_dim = 128, 64
+            data = fluid.layers.data(name='sequence', shape=[1],
+                                     dtype='int32', lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim = 512
-            x = fluid.layers.fc(input=data, size=hidden_dim * 3)
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
             hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim)
     """
 
@@ -924,13 +935,13 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
 
     Drop or keep each element of `x` independently. Dropout is a regularization
     technique for reducing overfitting by preventing neuron co-adaption during
-    training. The dropout operator randomly set (according to the given dropout
+    training. The dropout operator randomly sets (according to the given dropout
     probability) the outputs of some units to zero, while others are remain
     unchanged.
 
     Args:
-        x (Variable): The input tensor.
-         dropout_prob (float): Probability of setting units to zero.
+        x (Variable): The input tensor variable.
+        dropout_prob (float): Probability of setting units to zero.
         is_test (bool): A flag indicating whether it is in test phrase or not.
         seed (int): A Python integer used to create random seeds. If this
                     parameter is set to None, a random seed is used.
@@ -940,13 +951,14 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
                          will be named automatically.
 
     Returns:
-        Variable: A tensor variable.
+        Variable: A tensor variable is the shape with `x`.
 
     Examples:
+
         .. code-block:: python
 
-          x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-          droped = fluid.layers.dropout(input=x, dropout_rate=0.5)
+            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            droped = fluid.layers.dropout(x, dropout_prob=0.5)
     """
 
     helper = LayerHelper('dropout', **locals())
@@ -2990,32 +3002,33 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
 
     .. math::
-    y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
+
+        y = \\frac{x}{ \sqrt{\sum {x^2} + epsion }}
 
     For `x` with more dimensions, this layer independently normalizes each 1-D
     slice along dimension `axis`.
 
     Args:
         x(Variable|list): The input tensor to l2_normalize layer.
-        axis(int): The axis on which to apply normalization. If `axis < 0`,
+        axis(int): The axis on which to apply normalization. If `axis < 0`, \
             the dimension to normalization is rank(X) + axis. -1 is the
             last dimension.
-        epsilon(float): The epsilon value is used to avoid division by zero,
+        epsilon(float): The epsilon value is used to avoid division by zero, \
             the defalut value is 1e-10.
-        name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set None, the layer \
             will be named automatically.
 
-
     Returns:
-        Variable: The output tensor variable.
+        Variable: The output tensor variable is the same shape with `x`.
 
     Examples:
+
         .. code-block:: python
 
-          data = fluid.layers.data(name="data",
-                                   shape=(3, 17, 13),
-                                   dtype="float32")
-          normed = fluid.layers.l2_normalize(x=data, axis=1)
+            data = fluid.layers.data(name="data",
+                                     shape=(3, 17, 13),
+                                     dtype="float32")
+            normed = fluid.layers.l2_normalize(x=data, axis=1)
     """
 
     if len(x.shape) == 1:
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index f178248fd6..f585c88cbe 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -497,11 +497,27 @@ def save_combine(x, file_path, overwrite=True):
     Saves a list of variables into a single file.
 
     Args:
-        x(list): A list of Tensor/LoDTensor to be saved together in a single file.
+        x(list): A list of Tensor/LoDTensor variables to be saved together in
+                 a single file.
         file_path(str): The file path where variables will be saved.
-        overwrite(bool): Whether or not cover the given file when it has already 
+        overwrite(bool): Whether or not cover the given file when it has already
             existed. If it's set 'False' and the file is existed, a runtime 
             error will be thrown. 
+
+    Returns:
+        There is no return value.
+
+    Examples:
+
+        .. code-block:: python
+
+            v1 = fluid.layers.data(name="data",
+                                   shape=(4, 6),
+                                   dtype="float32")
+            v2 = fluid.layers.data(name="data",
+                                   shape=(6, 8, 4),
+                                   dtype="float32")
+            normed = fluid.layers.save_combine([v1, v2], file_path="output")
     """
     helper = LayerHelper("save_combine", **locals())
     helper.append_op(

From 566a94022337f227a86d4e2bcaa45dafc1cc36ae Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Fri, 15 Jun 2018 16:40:35 +0800
Subject: [PATCH 54/69] Implement a bilinear initializer for transposed
 convolution to do upsampling. (#11404)

* Implement a bilinear initializer for transposed convolution.

* Update some error message.
---
 python/paddle/fluid/initializer.py            | 102 +++++++++++++++++-
 .../fluid/tests/unittests/test_initializer.py |  17 +++
 2 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 4e132ed261..c36ad324e7 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -15,11 +15,13 @@
 import framework
 import numpy as np
 import contextlib
+from framework import convert_np_dtype_to_dtype_
+from core import VarDesc
 
 __all__ = [
-    'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'force_init_on_cpu',
     'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
-    'NormalInitializer', 'XavierInitializer'
+    'NormalInitializer', 'XavierInitializer', 'BilinearInitializer'
 ]
 
 _force_init_on_cpu_ = False
@@ -422,6 +424,101 @@ class MSRAInitializer(Initializer):
         return op
 
 
+class BilinearInitializer(Initializer):
+    """Implements the bilinear initializer.
+
+    This initializer can be used in transposed convolution operator to
+    act as upsampling. Users can upsample a feature map with shape of
+    (B, C, H, W) by any integer factor. The usage is:
+  
+    >>>  factor = 2
+    >>>  w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
+    >>>                     initializer=Bilinear())
+    >>>  conv_up = fluid.layers.conv2d_transpose(
+    >>>      input,
+    >>>      num_filters=C,
+    >>>      output_size=None,
+    >>>      filter_size=2 * factor - factor % 2,
+    >>>      padding=ceil((factor - 1) / 2.),
+    >>>      stride=factor,
+    >>>      groups=C,
+    >>>      param_attr=w_attr,
+    >>>      bias_attr=False)
+
+
+    Where, `num_filters=C` and `groups=C` means this is channel-wise tranposed
+    convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
+    This initializer will set a (K, K) interpolation kernel for every channel
+    of the filter identically. The resulting shape of the output feature map
+    will be (B, C, factor * H, factor * W). Note that the learning rate and the
+    weight decay are set to 0 in order to keep coefficient values of bilinear
+    interpolation unchanged during training. 
+    """
+
+    def __init__(self):
+        """Constructor for BilinearInitializer.
+        """
+        super(BilinearInitializer, self).__init__()
+
+    def __call__(self, var, block):
+        """Add biliear initialization ops for a variable
+
+        Args:
+            var (Variable): Variable that needs to be initialized.
+            block (Block): The block in which initialization ops should
+                           be added.
+
+        Returns:
+            the initialization op
+
+        Raises:
+            ValueError: If type of `var` and `block` is not right.
+                        If the shape of `var` size is not 4 and
+                        var.shape[2] != var.shape[3].
+        """
+        if not isinstance(var, framework.Variable):
+            raise ValueError("var must be framework.Variable.")
+
+        if not isinstance(block, framework.Block):
+            raise ValueError("block must be framework.Block.")
+
+        shape = var.shape
+        if len(shape) != 4:
+            raise ValueError("the length of shape must be 4.")
+        if shape[2] != shape[3]:
+            raise ValueError("shape[2] must be equal to shape[3].")
+
+        weight = np.zeros(np.prod(var.shape), dtype='float32')
+        size = shape[3]
+        # factor
+        f = np.ceil(size / 2.)
+        # center
+        c = (2 * f - 1 - f % 2) / (2. * f)
+        for i in range(np.prod(shape)):
+            x = i % size
+            y = (i / size) % size
+            weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
+        weight = np.reshape(weight, shape)
+
+        if var.dtype == VarDesc.VarType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in weight.flat]
+        else:
+            raise ValueError("Unsupported dtype %s", input.dtype)
+        if np.prod(shape) > 1024 * 1024:
+            raise ValueError("The size of input is too big. ")
+        op = block.append_op(
+            type='assign_value',
+            outputs={'Out': [var]},
+            attrs={
+                'dtype': var.dtype,
+                'shape': list(shape),
+                value_name: values
+            })
+        var.op = op
+        return op
+
+
 # We short the class name, since users will use the initializer with the package
 # name. The sample code:
 #
@@ -436,3 +533,4 @@ Uniform = UniformInitializer
 Normal = NormalInitializer
 Xavier = XavierInitializer
 MSRA = MSRAInitializer
+Bilinear = BilinearInitializer
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index 587e2025e1..15a72cb605 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -364,5 +364,22 @@ class TestMSRAInitializer(unittest.TestCase):
         self.assertEqual(init_op.attr('seed'), 134)
 
 
+class TestMSRAInitializer(unittest.TestCase):
+    def test_bilinear_initializer(self):
+        """Test the bilinear initializer with supplied arguments
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[8, 1, 3, 3],
+            lod_level=0,
+            name="param",
+            initializer=initializer.BilinearInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'assign_value')
+
+
 if __name__ == '__main__':
     unittest.main()

From a427e769584e9384c341bfc1fc5fface6d6bcf32 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 15 Jun 2018 16:42:16 +0800
Subject: [PATCH 55/69] skip use_mkldnn if do not use it

---
 paddle/testing/paddle_gtest_main.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 586ec48477..7772dc97f5 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -30,7 +30,8 @@ int main(int argc, char** argv) {
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
+  new_argv.push_back(strdup("--undefok=use_mkldnn"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();

From 4a0f3743c34b8f75e564dffd37a656cf63f89d24 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 15 Jun 2018 17:03:36 +0800
Subject: [PATCH 56/69] Refine API doc

---
 python/paddle/fluid/layers/learning_rate_scheduler.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 716cc7824e..fe9b40b817 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -209,6 +209,14 @@ def polynomial_decay(learning_rate,
 def piecewise_decay(boundaries, values):
     """Applies piecewise decay to the initial learning rate.
 
+    Args:
+        boundaries: A list of steps numbers.
+        values: A list of learning rate values that will be picked during
+            different step boundaries.
+
+    Returns:
+        The decayed learning rate.
+
     >>> boundaries = [10000, 20000]
     >>> values = [1.0, 0.5, 0.1]
     >>>

From cafdeb0a403b7a62ddebccf559489520afe5b972 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Fri, 15 Jun 2018 02:24:17 -0700
Subject: [PATCH 57/69] Fix docs for detection_output & target_assign

---
 python/paddle/fluid/layers/detection.py | 53 ++++++++++++++++---------
 python/paddle/fluid/layers/nn.py        | 13 +++---
 2 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index edf528a595..f46ca7f132 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -97,7 +97,9 @@ def detection_output(loc,
         nms_eta(float): The parameter for adaptive NMS.
 
     Returns:
-        Variable: The detection outputs is a LoDTensor with shape [No, 6].
+        Variable: 
+        
+            The detection outputs is a LoDTensor with shape [No, 6].
             Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
             `No` is the total number of detections in this mini-batch. For each
             instance, the offsets in first dimension are called LoD, the offset
@@ -110,15 +112,15 @@ def detection_output(loc,
     Examples:
         .. code-block:: python
 
-        pb = layers.data(name='prior_box', shape=[10, 4],
+            pb = layers.data(name='prior_box', shape=[10, 4],
                          append_batch_size=False, dtype='float32')
-        pbv = layers.data(name='prior_box_var', shape=[10, 4],
+            pbv = layers.data(name='prior_box_var', shape=[10, 4],
                           append_batch_size=False, dtype='float32')
-        loc = layers.data(name='target_box', shape=[2, 21, 4],
+            loc = layers.data(name='target_box', shape=[2, 21, 4],
                           append_batch_size=False, dtype='float32')
-        scores = layers.data(name='scores', shape=[2, 21, 10],
+            scores = layers.data(name='scores', shape=[2, 21, 10],
                           append_batch_size=False, dtype='float32')
-        nmsed_outs = fluid.layers.detection_output(scores=scores,
+            nmsed_outs = fluid.layers.detection_output(scores=scores,
                                        loc=loc,
                                        prior_box=pb,
                                        prior_box_var=pbv)
@@ -296,8 +298,6 @@ def target_assign(input,
                   mismatch_value=None,
                   name=None):
     """
-    **Target assigner operator**
-
     This operator can be, for given the target bounding boxes or labels,
     to assign classification and regression targets to each prediction as well as
     weights to prediction. The weights is used to specify which prediction would
@@ -311,20 +311,24 @@ def target_assign(input,
 
     1. Assigning all outpts based on `match_indices`:
 
-    If id = match_indices[i][j] > 0,
+    .. code-block:: text
+
+        If id = match_indices[i][j] > 0,
 
-        out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
-        out_weight[i][j] = 1.
+            out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
+            out_weight[i][j] = 1.
 
-    Otherwise,
+        Otherwise,
 
-        out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
-        out_weight[i][j] = 0.
+            out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
+            out_weight[i][j] = 0.
 
     2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided:
 
     Assumed that the row offset for each instance in `neg_indices` is called neg_lod,
     for i-th instance and each `id` of neg_indices in this instance:
+    
+    .. code-block:: text
 
         out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
         out_weight[i][id] = 1.0
@@ -341,10 +345,23 @@ def target_assign(input,
        mismatch_value (float32): Fill this value to the mismatched location.
 
     Returns:
-       out (Variable): The output is a 3D Tensor with shape [N, P, K],
-           N and P is the same as they are in `neg_indices`, K is the
-           same as it in input of X. If `match_indices[i][j]`.
-       out_weight (Variable): The weight for output with the shape of [N, P, 1].
+        tuple: 
+        
+               A tuple(out, out_weight) is returned. out is a 3D Tensor with 
+               shape [N, P, K], N and P is the same as they are in 
+               `neg_indices`, K is the same as it in input of X. If 
+               `match_indices[i][j]`. out_weight is the weight for output with 
+               the shape of [N, P, 1].
+
+    Examples:
+
+        .. code-block:: python
+
+            matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
+            gt = layers.data(
+                        name='gt', shape=[1, 1], dtype='int32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                            gt, matched_indices, mismatch_value=0)
     """
     helper = LayerHelper('target_assign', **locals())
     out = helper.create_tmp_variable(dtype=input.dtype)
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0a45098bda..5d417daea1 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3466,7 +3466,9 @@ def nce(input,
         input (Variable): input variable.
         label (Variable): label.
         num_total_classes (int):${num_total_classes_comment}
-        sample_weight (int): ${sample_weight_comment}
+        sample_weight (Variable|None): A Variable of shape [batch_size, 1] 
+            storing a weight for each sample. The default weight for each 
+            sample is 1.0.
         param_attr (ParamAttr|None): attributes for parameter
         bias_attr (ParamAttr|None): attributes for bias
         num_neg_samples (int): ${num_neg_samples_comment}
@@ -4638,10 +4640,6 @@ def random_crop(x, shape, seed=None):
     """
     ${comment}
 
-    Examples:
-        >>> img = fluid.layers.data("img", [3, 256, 256])
-        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
-
     Args:
         x(${x_type}): ${x_comment}
         shape(${shape_type}): ${shape_comment}
@@ -4650,7 +4648,10 @@ def random_crop(x, shape, seed=None):
 
     Returns:
         ${out_comment}
-
+    
+    Examples:
+        >>> img = fluid.layers.data("img", [3, 256, 256])
+        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
     """
     helper = LayerHelper("random_crop", **locals())
     dtype = helper.input_dtype()

From a219f3cc19a7be055eb484a611e99e44a965541b Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 15 Jun 2018 17:36:07 +0800
Subject: [PATCH 58/69] follow comments

---
 .../fluid/layers/learning_rate_scheduler.py   | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index fe9b40b817..fef1dca61b 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -209,6 +209,18 @@ def polynomial_decay(learning_rate,
 def piecewise_decay(boundaries, values):
     """Applies piecewise decay to the initial learning rate.
 
+      The algorithm can be described as the code below.
+
+      .. code-block:: python
+
+        boundaries = [10000, 20000]
+        values = [1.0, 0.5, 0.1]
+        if step < 10000:
+            learning_rate = 1.0
+        elif 10000 <= step < 20000:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
     Args:
         boundaries: A list of steps numbers.
         values: A list of learning rate values that will be picked during
@@ -217,15 +229,7 @@ def piecewise_decay(boundaries, values):
     Returns:
         The decayed learning rate.
 
-    >>> boundaries = [10000, 20000]
-    >>> values = [1.0, 0.5, 0.1]
-    >>>
-    >>> if step < 10000:
-    >>>     learning_rate = 1.0
-    >>> elif 10000 <= step < 20000:
-    >>>     learning_rate = 0.5
-    >>> else:
-    >>>     learning_rate = 0.1
+
     """
 
     if len(values) - len(boundaries) != 1:

From f6daab438db0570b098963fb7f91dd01110918db Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Fri, 15 Jun 2018 17:59:04 +0800
Subject: [PATCH 59/69] fix a bug

---
 python/paddle/fluid/layers/tensor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 09c0b2fa79..973059a2ce 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -115,7 +115,7 @@ def create_global_var(shape,
     """
     helper = LayerHelper("global_var", **locals())
     var = helper.create_global_variable(
-        dtype=dtype, shape=shape, persistable=persistable)
+        dtype=dtype, shape=shape, persistable=persistable, name=name)
     helper.set_variable_initializer(
         var, initializer=Constant(
             value=float(value), force_cpu=force_cpu))

From 316eb3e968b8310f38a9308e813e15902f90f771 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Fri, 15 Jun 2018 03:03:28 -0700
Subject: [PATCH 60/69] Add doc for layers.auc

---
 python/paddle/fluid/layers/metric.py | 37 ++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index a1c64ce277..15d7c50bf4 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -53,6 +53,43 @@ def accuracy(input, label, k=1, correct=None, total=None):
 
 
 def auc(input, label, curve='ROC', num_thresholds=200):
+    """
+    **Area Under The Curve (AUC) Layer**
+
+    This implementation computes the AUC according to forward output and label.
+    It is used very widely in binary classification evaluation. 
+
+    As a note: If input label contains values other than 0 and 1, it will be 
+    cast to bool. You can find the relevant definitions `here 
+     <https://en.wikipedia.org/wiki/Receiver_operating_characteristic
+      #Area_under_the_curve>`_.
+
+    There are two types of possible curves:
+    1. ROC: Receiver operating characteristic
+    2. PR: Precision Recall
+
+    Args:
+        input(Variable): A floating-point 2D Variable, values are in the range 
+                         [0, 1]. Each row is sorted in descending order. This 
+                         input should be the output of topk. Typically, this 
+                         Variable indicates the probability of each label.
+        label(Variable): A 2D int Variable indicating the label of the training 
+                         data. The height is batch size and width is always 1.
+        curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'.
+        num_thresholds(int): The number of thresholds to use when discretizing 
+                             the roc curve. Default 200.
+
+    Returns:
+        Variable: A scalar representing the current AUC.
+
+    Examples:
+        .. code-block:: python
+        
+        # network is a binary classification model and label the ground truth
+        prediction = network(image, is_infer=True)
+        auc_out=fluid.layers.auc(input=prediction, label=label)
+    """
+
     warnings.warn(
         "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
         but can not aggregate them and get the pass AUC, because pass \

From f3a777d8e299b9d740e06f2dc51a88b5211f789d Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Fri, 15 Jun 2018 03:40:16 -0700
Subject: [PATCH 61/69] Fix the display of reciprocal's formula

---
 paddle/fluid/operators/activation_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 5065244c44..92fbbc2854 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -196,7 +196,7 @@ $out = [x]$
 __attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
 Reciprocal Activation Operator.
 
-$$out = \frac{1}{x}$$
+$$out = \\frac{1}{x}$$
 
 )DOC";
 

From cff5232e76ab6b424fc4453dc3cdc463f5680030 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Fri, 15 Jun 2018 17:58:30 +0800
Subject: [PATCH 62/69] remove Non-ASCII character '\xc2'

---
 python/paddle/fluid/layers/control_flow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 532ffd754d..82f3d66ded 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -236,7 +236,7 @@ class ParallelDo(object):
     """
     ParallelDo is used to represent multi-thread data parallel processing.
 
-    Its vanilla implementation can be shown as the following (:math:`|` means 
+    Its vanilla implementation can be shown as the following (:math:`|` means
     single thread and :math:`||||` means multiple threads)
 
     .. code-block:: text
@@ -252,7 +252,7 @@ class ParallelDo(object):
         ||||   Compute backward pass in parallel
         |      accumulate param@grad from different devices to the first device
         |      Merge input@grad from different devices
-        |      Copy param@grad to the place of parallel_do_op
+        |      Copy param@grad to the place of parallel_do_op
 
     Examples:
 

From 23ec12cfe95bed8e0d9e7c5840451d5f50d1cab2 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Fri, 15 Jun 2018 04:43:19 -0700
Subject: [PATCH 63/69] Fix the problem that metric cannot display

---
 doc/fluid/api/gen_doc.sh      |   2 +-
 doc/fluid/api/initializer.rst |  14 +++
 doc/fluid/api/io.rst          |  36 +++++++
 doc/fluid/api/layers.rst      | 181 +++++++++++++++++++++++++++++-----
 doc/fluid/api/optimizer.rst   |   7 ++
 doc/fluid/api/profiler.rst    |  12 +++
 6 files changed, 225 insertions(+), 27 deletions(-)

diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index 27f2419c06..acc8b4aa3f 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
 
 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index c49a98c744..57efc9823c 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,6 +33,13 @@ Xavier
     :members:
     :noindex:
 
+Bilinear
+--------
+
+..  autoclass:: paddle.fluid.initializer.Bilinear
+    :members:
+    :noindex:
+
 force_init_on_cpu
 -----------------
 
@@ -73,3 +80,10 @@ XavierInitializer
     :members:
     :noindex:
 
+BilinearInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.BilinearInitializer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index dd9d88b669..21334c9eda 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,3 +59,39 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
     :noindex:
 
+save_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+load_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+clean_checkpoint
+----------------
+
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
+
+load_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
+    :noindex:
+
+save_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
+    :noindex:
+
+get_latest_checkpoint_serial
+----------------------------
+
+..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
+    :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 8d1c9247b1..1f8f636040 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,6 +181,12 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
     :noindex:
 
+is_empty
+--------
+
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
 device
 ======
 
@@ -219,6 +225,12 @@ Send
 ..  autofunction:: paddle.fluid.layers.Send
     :noindex:
 
+Recv
+----
+
+..  autofunction:: paddle.fluid.layers.Recv
+    :noindex:
+
 open_recordio_file
 ------------------
 
@@ -255,6 +267,25 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
     :noindex:
 
+random_data_generator
+---------------------
+
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+Preprocessor
+------------
+
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
+load
+----
+
+..  autofunction:: paddle.fluid.layers.load
+    :noindex:
+
 nn
 ==
 
@@ -399,10 +430,9 @@ conv2d_transpose
 conv3d_transpose
 ----------------
 
-..  autofunction:: paddle.fluid.layers.conv2d_transpose
+..  autofunction:: paddle.fluid.layers.conv3d_transpose
     :noindex:
 
-
 sequence_expand
 ---------------
 
@@ -613,6 +643,48 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
     :noindex:
 
+dice_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+image_resize
+------------
+
+..  autofunction:: paddle.fluid.layers.image_resize
+    :noindex:
+
+image_resize_short
+------------------
+
+..  autofunction:: paddle.fluid.layers.image_resize_short
+    :noindex:
+
+resize_bilinear
+---------------
+
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+gather
+------
+
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+random_crop
+-----------
+
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
+mean_iou
+--------
+
+..  autofunction:: paddle.fluid.layers.mean_iou
+    :noindex:
+
 ops
 ===
 
@@ -718,12 +790,6 @@ logical_not
 ..  autofunction:: paddle.fluid.layers.logical_not
     :noindex:
 
-uniform_random
---------------
-
-..  autofunction:: paddle.fluid.layers.uniform_random
-    :noindex:
-
 uniform_random_batch_size_like
 ------------------------------
 
@@ -742,12 +808,6 @@ gaussian_random_batch_size_like
 ..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
     :noindex:
 
-cumsum
-------
-
-..  autofunction:: paddle.fluid.layers.cumsum
-    :noindex:
-
 scatter
 -------
 
@@ -760,6 +820,30 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
     :noindex:
 
+slice
+-----
+
+..  autofunction:: paddle.fluid.layers.slice
+    :noindex:
+
+polygon_box_transform
+---------------------
+
+..  autofunction:: paddle.fluid.layers.polygon_box_transform
+    :noindex:
+
+shape
+-----
+
+..  autofunction:: paddle.fluid.layers.shape
+    :noindex:
+
+maxout
+------
+
+..  autofunction:: paddle.fluid.layers.maxout
+    :noindex:
+
 sigmoid
 -------
 
@@ -916,18 +1000,6 @@ stanh
 ..  autofunction:: paddle.fluid.layers.stanh
     :noindex:
 
-hard_shrink
------------
-
-..  autofunction:: paddle.fluid.layers.hard_shrink
-    :noindex:
-
-thresholded_relu
-----------------
-
-..  autofunction:: paddle.fluid.layers.thresholded_relu
-    :noindex:
-
 hard_sigmoid
 ------------
 
@@ -940,6 +1012,30 @@ swish
 ..  autofunction:: paddle.fluid.layers.swish
     :noindex:
 
+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+hard_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.hard_shrink
+    :noindex:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+thresholded_relu
+----------------
+
+..  autofunction:: paddle.fluid.layers.thresholded_relu
+    :noindex:
+
 tensor
 ======
 
@@ -997,6 +1093,18 @@ fill_constant
 ..  autofunction:: paddle.fluid.layers.fill_constant
     :noindex:
 
+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
 ones
 ----
 
@@ -1012,6 +1120,12 @@ zeros
 detection
 =========
 
+prior_box
+---------
+
+..  autofunction:: paddle.fluid.layers.prior_box
+    :noindex:
+
 multi_box_head
 --------------
 
@@ -1099,3 +1213,18 @@ noam_decay
 ..  autofunction:: paddle.fluid.layers.noam_decay
     :noindex:
 
+metric
+======
+
+accuracy
+--------
+
+..  autofunction:: paddle.fluid.layers.accuracy
+    :noindex:
+
+auc
+---
+
+..  autofunction:: paddle.fluid.layers.auc
+    :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index 79a0995fce..6ad44bb690 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,6 +89,13 @@ DecayedAdagradOptimizer
     :members:
     :noindex:
 
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
 Adadelta
 --------
 
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
index 74d102dcb0..39fda65863 100644
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,3 +23,15 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
     :noindex:
 
+start_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+stop_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
+

From 68811bcb5d93a9bcbafae81c0ec866e936e3de25 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Fri, 15 Jun 2018 05:08:32 -0700
Subject: [PATCH 64/69] Format the doc of layers.auc

---
 python/paddle/fluid/layers/metric.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index 15d7c50bf4..ed2f05e5a9 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -59,14 +59,14 @@ def auc(input, label, curve='ROC', num_thresholds=200):
     This implementation computes the AUC according to forward output and label.
     It is used very widely in binary classification evaluation. 
 
-    As a note: If input label contains values other than 0 and 1, it will be 
-    cast to bool. You can find the relevant definitions `here 
-     <https://en.wikipedia.org/wiki/Receiver_operating_characteristic
-      #Area_under_the_curve>`_.
+    Note: If input label contains values other than 0 and 1, it will be cast 
+    to `bool`. Find the relevant definitions `here <https://en.wikipedia.org\
+    /wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
 
     There are two types of possible curves:
-    1. ROC: Receiver operating characteristic
-    2. PR: Precision Recall
+
+        1. ROC: Receiver operating characteristic;
+        2. PR: Precision Recall
 
     Args:
         input(Variable): A floating-point 2D Variable, values are in the range 
@@ -85,9 +85,9 @@ def auc(input, label, curve='ROC', num_thresholds=200):
     Examples:
         .. code-block:: python
         
-        # network is a binary classification model and label the ground truth
-        prediction = network(image, is_infer=True)
-        auc_out=fluid.layers.auc(input=prediction, label=label)
+            # network is a binary classification model and label the ground truth
+            prediction = network(image, is_infer=True)
+            auc_out=fluid.layers.auc(input=prediction, label=label)
     """
 
     warnings.warn(

From 2efb0e5b70b2291151a14963813f7a968cad797a Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Fri, 15 Jun 2018 05:22:16 -0700
Subject: [PATCH 65/69] cased correction

---
 python/paddle/fluid/layers/metric.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index ed2f05e5a9..0a978eaa37 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -54,7 +54,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
 
 def auc(input, label, curve='ROC', num_thresholds=200):
     """
-    **Area Under The Curve (AUC) Layer**
+    **Area Under the Curve (AUC) Layer**
 
     This implementation computes the AUC according to forward output and label.
     It is used very widely in binary classification evaluation. 

From dd55cc16472f9669029276e2c198bd3f2ee71b52 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Fri, 15 Jun 2018 08:31:23 -0500
Subject: [PATCH 66/69] fix warning (#11518)

---
 paddle/fluid/operators/crop_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index 91cfbbda73..772e80bbea 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -52,7 +52,7 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
   } else {
     res = ctx.Attr<std::vector<int>>("offsets");
     PADDLE_ENFORCE_EQ(
-        rank, res.size(),
+        rank, static_cast<int>(res.size()),
         "Offsets size should be equal to dimension size of input tensor.");
   }
   return res;

From 53d1d0f0f2e0c0ab87150d4b4e8a77530b8d227c Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Fri, 15 Jun 2018 23:12:46 +0800
Subject: [PATCH 67/69] add LARS support (#10374)

---
 .../fluid/layers/learning_rate_scheduler.py   | 41 ++++++++++++++++++-
 python/paddle/fluid/optimizer.py              | 23 ++++++++---
 .../fluid/tests/book/test_recognize_digits.py |  2 +-
 3 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index fef1dca61b..e0ac0846a6 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -25,10 +25,11 @@ import nn
 import ops
 import tensor
 from ..initializer import init_on_cpu
+from ..framework import default_main_program, Parameter
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
 ]
 
 
@@ -261,3 +262,41 @@ def piecewise_decay(boundaries, values):
                 tensor.assign(last_value_var, lr)
 
     return lr
+
+
+def append_LARS(params_grads, learning_rate, weight_decay):
+    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
+       each layer.
+
+    ```python
+        learning_rate *= local_gw_ratio * sqrt(sumsq(param))
+                        / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
+    ```
+
+    Args:
+        learning_rate: A learning rate Variable. This
+          is the global learning rate for LARS.
+        weight_decay: A Python `float` number.
+
+    Returns:
+        The decayed learning rate
+    """
+
+    def _balanced_weight(param_norm, grad_norm):
+        if weight_decay == 1.0:
+            return grad_norm + param_norm
+        else:
+            return grad_norm + weight_decay * param_norm
+
+    for param, grad in params_grads:
+        param_lr = param.optimize_attr['learning_rate']
+        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
+        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
+        if type(param_lr) == float and param_lr == 1.0:
+            decayed_lr = learning_rate * param_norm \
+                         / _balanced_weight(param_norm, grad_norm)
+        else:
+            decayed_lr = learning_rate * param_lr * param_norm \
+                         / _balanced_weight(param_norm, grad_norm)
+        # set back param local learning rate
+        param.optimize_attr['learning_rate'] = decayed_lr
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 115362c6bf..54fe935627 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import re
 from collections import defaultdict
-from paddle.fluid.framework import Program
+from paddle.fluid.framework import Program, Variable
 import framework
 import layers
 from backward import append_backward
@@ -41,7 +41,10 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, learning_rate, regularization=None):
+    def __init__(self,
+                 learning_rate,
+                 regularization=None,
+                 LARS_weight_decay=0.0):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")
@@ -61,6 +64,7 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
+        self._LARS_weight_decay = LARS_weight_decay
 
     def _create_global_learning_rate(self):
         lr = self.global_learning_rate()
@@ -100,10 +104,15 @@ class Optimizer(object):
         # create learning rate variable for every parameter
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
-        if param_lr == 1.0:
-            return self.global_learning_rate()
+        if type(param_lr) == Variable:
+            # param learning rate has been updated (LARS)
+            print("returns updated param lr ", param_lr)
+            return param_lr
         else:
-            return self.global_learning_rate() * param_lr
+            if param_lr == 1.0:
+                return self.global_learning_rate()
+            else:
+                return self.global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -210,6 +219,10 @@ class Optimizer(object):
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
             self._create_global_learning_rate()
+            if self._LARS_weight_decay > 0.0:
+                layers.append_LARS(parameters_and_grads,
+                                   self.global_learning_rate(),
+                                   self._LARS_weight_decay)
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 578b1162fb..25bcb8a641 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -94,7 +94,7 @@ def train(nn_type,
 
     test_program = fluid.default_main_program().clone(for_test=True)
 
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
     optimizer.minimize(avg_loss)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

From 417fcf4f43f350ee4cb8407f8e1d98da4dc0b9dd Mon Sep 17 00:00:00 2001
From: Kexin Zhao <kexin.zhao.paddle@gmail.com>
Date: Fri, 15 Jun 2018 08:50:01 -0700
Subject: [PATCH 68/69] Modify Pybind LoDTensor API according to length-based
 LoD (#11106)

* add lod_tensor util and modify pybind

* refind pybind LoDTensor API and modify LoDTensor and DataFeeder test

* fix test error

* fix detection map op test

* fix reorder_lod_tensor test

* fix seq_concat_op

* fix chunk evel op test

* fix target assign op

* fix warp ctc op

* address comments step 1: reverse reset_lod op

* step 2: modify op test

* add warning message

* remove has_valid_lod

* add back has_valid_lod

* address comments

* add exception catching trial
---
 benchmark/fluid/models/machine_translation.py |  15 --
 .../fluid/models/stacked_dynamic_lstm.py      |  15 --
 paddle/fluid/framework/lod_tensor.cc          |  33 +++++
 paddle/fluid/framework/lod_tensor.h           |  14 ++
 paddle/fluid/framework/lod_tensor_test.cc     |  32 +++++
 paddle/fluid/pybind/pybind.cc                 |  74 ++++++++--
 python/paddle/fluid/data_feeder.py            |   7 +-
 python/paddle/fluid/lod_tensor.py             |  83 +----------
 python/paddle/fluid/tests/test_data_feeder.py |  17 ++-
 python/paddle/fluid/tests/test_lod_tensor.py  |  65 +++++----
 .../paddle/fluid/tests/unittests/op_test.py   |  14 +-
 .../tests/unittests/test_batch_norm_op.py     |   2 +-
 .../unittests/test_beam_search_decode_op.py   |  22 +--
 .../tests/unittests/test_beam_search_op.py    |   8 +-
 .../unittests/test_bipartite_match_op.py      |  20 +--
 .../tests/unittests/test_box_coder_op.py      |  18 ++-
 .../tests/unittests/test_chunk_eval_op.py     |   8 +-
 .../tests/unittests/test_crf_decoding_op.py   |  31 +++--
 .../fluid/tests/unittests/test_ctc_align.py   |  12 +-
 .../tests/unittests/test_detection_map_op.py  |  36 ++---
 .../unittests/test_dynrnn_gradient_check.py   |   6 +-
 .../unittests/test_dynrnn_static_input.py     |  47 ++++---
 .../tests/unittests/test_edit_distance_op.py  |  44 +++---
 .../tests/unittests/test_feed_fetch_method.py |   7 +-
 .../test_fill_constant_batch_size_like_op.py  |   2 +-
 .../fluid/tests/unittests/test_gru_op.py      |  12 +-
 .../tests/unittests/test_iou_similarity_op.py |   4 +-
 .../unittests/test_linear_chain_crf_op.py     |  14 +-
 .../tests/unittests/test_lod_rank_table.py    |   3 +-
 .../tests/unittests/test_lod_reset_op.py      |  44 +++---
 .../tests/unittests/test_lod_tensor_array.py  |   8 +-
 .../unittests/test_lod_tensor_array_ops.py    |  35 ++---
 .../fluid/tests/unittests/test_lstm_op.py     |  40 +++---
 .../fluid/tests/unittests/test_lstmp_op.py    |  34 ++---
 .../unittests/test_mine_hard_examples_op.py   |   4 +-
 .../tests/unittests/test_multiclass_nms_op.py |   4 +-
 .../fluid/tests/unittests/test_one_hot_op.py  |  20 +--
 .../fluid/tests/unittests/test_print_op.py    |   4 +-
 .../unittests/test_reorder_lod_tensor.py      |  52 ++++---
 .../fluid/tests/unittests/test_roi_pool_op.py |   3 +-
 .../fluid/tests/unittests/test_row_conv_op.py |  15 +-
 .../tests/unittests/test_seq_concat_op.py     |  51 +++----
 .../fluid/tests/unittests/test_seq_conv.py    |  57 +++++---
 .../fluid/tests/unittests/test_seq_pool.py    | 129 ++++++++++--------
 .../tests/unittests/test_sequence_erase_op.py |  16 ++-
 .../tests/unittests/test_sequence_expand.py   |  40 +++---
 .../tests/unittests/test_sequence_reshape.py  |  16 +--
 .../tests/unittests/test_sequence_slice_op.py |  12 +-
 .../unittests/test_sequence_softmax_op.py     |  13 +-
 .../tests/unittests/test_shrink_rnn_memory.py |   6 +-
 .../test_split_and_merge_lod_tensor_op.py     |  13 +-
 .../tests/unittests/test_target_assign_op.py  |  43 +++---
 .../fluid/tests/unittests/test_tensor.py      |  27 ++--
 .../fluid/tests/unittests/test_warpctc_op.py  |  35 +++--
 .../unittests/test_weight_normalization.py    |   8 +-
 .../paddle/fluid/tests/unittests/testsuite.py |   4 +-
 tools/codestyle/cpplint_pre_commit.hook       |   2 +-
 57 files changed, 765 insertions(+), 635 deletions(-)

diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py
index 69541adf6b..17f6b03826 100644
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -173,21 +173,6 @@ def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
         return avg_cost, feeding_list
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    lod_t = core.LoDTensor()
-    lod_t.set(flattened_data, place)
-    lod_t.set_lod([lod])
-    return lod_t, lod[-1]
-
-
 def lodtensor_to_ndarray(lod_tensor):
     dims = lod_tensor.get_dims()
     ndarray = np.zeros(shape=dims).astype('float32')
diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
index 211869af4e..3231542a17 100644
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -125,18 +125,3 @@ def get_model(args):
         batch_size=args.batch_size)
 
     return loss, inference_program, adam, train_reader, test_reader, batch_acc
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index a56674cbe2..e331c8128f 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -410,5 +410,38 @@ void LoDTensor::MergeLoDTensor(
   }
 }
 
+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    if (offset_lod[lvl].size() > 0) {
+      level.reserve(offset_lod[lvl].size() - 1);
+    }
+    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
+      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
+  LoD offset_lod;
+  offset_lod.reserve(length_lod.size());
+  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    level.reserve(length_lod[lvl].size() + 1);
+    size_t tmp = 0;
+    level.push_back(tmp);
+    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
+      tmp += length_lod[lvl][idx];
+      level.push_back(tmp);
+    }
+    offset_lod.push_back(level);
+  }
+  return offset_lod;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 1159fee39b..4a2729373b 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -226,5 +226,19 @@ extern void WriteToRecordIO(recordio::Writer* writer,
 extern std::vector<LoDTensor> ReadFromRecordIO(
     recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
 
+/*
+ * Convert between length-based LoD and offset-based LoD.
+ * The implementation of LoDTensor class use offset-based LoD.
+ * However, we want to expose the more user-friendly length-based
+ * LoD to the Python side instead.
+ *
+ * Example:
+ * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
+ * then length_lod = [[2, 1], [3, 2, 4]]
+ */
+LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
+
+LoD ConvertToOffsetBasedLoD(const LoD& length_lod);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 2ceffc9331..6dfe7d2d8c 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -228,6 +228,38 @@ TEST(LoD, CheckAbsLoD) {
   ASSERT_FALSE(CheckAbsLoD(abs_lod0));
 }
 
+TEST(LoD, ConvertToLengthBasedLoD) {
+  LoD offset_lod;
+  offset_lod.push_back(std::vector<size_t>({0, 2}));
+  offset_lod.push_back(std::vector<size_t>({0, 1, 3}));
+  offset_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  LoD length_lod = ConvertToLengthBasedLoD(offset_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({2}));
+  expected.push_back(std::vector<size_t>({1, 2}));
+  expected.push_back(std::vector<size_t>({2, 2, 1}));
+
+  EXPECT_EQ(length_lod, expected);
+}
+
+TEST(LoD, ConvertToOffsetBasedLoD) {
+  LoD length_lod;
+  length_lod.push_back(std::vector<size_t>({2}));
+  length_lod.push_back(std::vector<size_t>({1, 2}));
+  length_lod.push_back(std::vector<size_t>({2, 2, 1}));
+
+  LoD offset_lod = ConvertToOffsetBasedLoD(length_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({0, 2}));
+  expected.push_back(std::vector<size_t>({0, 1, 3}));
+  expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  EXPECT_EQ(offset_lod, expected);
+}
+
 template <typename T>
 static void TestRecordIO() {
   LoDTensor tensor;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index bd5c613f8c..74036bcb31 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -144,28 +144,74 @@ PYBIND11_PLUGIN(core) {
   py::class_<LoDTensor, Tensor>(m, "LoDTensor")
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
-      .def(
-          "__init__",
-          [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-            LoD new_lod;
-            new_lod.reserve(lod.size());
-            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-            new (&instance) LoDTensor(new_lod);
-          })
+      .def("__init__",
+           [](LoDTensor &instance, const std::vector<std::vector<size_t>>
+                                       &recursive_sequence_lengths) {
+             LoD new_lod;
+             new_lod.reserve(recursive_sequence_lengths.size());
+             std::copy(recursive_sequence_lengths.begin(),
+                       recursive_sequence_lengths.end(),
+                       std::back_inserter(new_lod));
+             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
+             PADDLE_ENFORCE(
+                 CheckLoD(new_offset_lod, -1),
+                 "the provided recursive_sequence_lengths info is invalid");
+             new (&instance) LoDTensor(new_offset_lod);
+           })
       .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
+             // the input lod is offset-based level-of-detail info
+             LOG(WARNING)
+                 << "set_lod is deprecated and will be removed by 9.2018, "
+                    "please switch to set_recursive_sequence_lengths.";
              LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()),
+                            "the provided lod info is invalid");
              self.set_lod(new_lod);
            })
-      .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-        auto lod = self.lod();
-        std::vector<std::vector<size_t>> new_lod;
-        new_lod.reserve(lod.size());
-        std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-        return new_lod;
+      .def("set_recursive_sequence_lengths",
+           [](LoDTensor &self, const std::vector<std::vector<size_t>>
+                                   &recursive_sequence_lengths) {
+             // the input recursive_sequence_lengths is length-based
+             // level-of-detail info
+             LoD new_lod;
+             new_lod.reserve(recursive_sequence_lengths.size());
+             std::copy(recursive_sequence_lengths.begin(),
+                       recursive_sequence_lengths.end(),
+                       std::back_inserter(new_lod));
+             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
+             PADDLE_ENFORCE(
+                 CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
+                 "the provided recursive_sequence_lengths info is invalid");
+             self.set_lod(new_offset_lod);
+           })
+      .def("lod",
+           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
+             // output the offset-based lod info
+             LOG(WARNING) << "lod is deprecated and will be removed by 9.2018, "
+                             "please switch to recursive_sequence_lengths.";
+             LoD lod = self.lod();
+             std::vector<std::vector<size_t>> new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return new_lod;
+           })
+      .def("recursive_sequence_lengths",
+           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
+             // output the length-based lod info
+             LoD lod = ConvertToLengthBasedLoD(self.lod());
+             std::vector<std::vector<size_t>> new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return new_lod;
+           })
+      .def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool {
+        // Check that the lod info is valid and match the outermost
+        // dimension of the LoDTensor data
+        return CheckLoD(self.lod(), vectorize(self.dims()).front());
       });
 
   py::class_<SelectedRows>(m, "SelectedRows")
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index e2013137b1..ac39600201 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -47,7 +47,7 @@ class DataToLoDTensorConverter(object):
         self.lod = []
 
         for i in six.range(lod_level):
-            self.lod.append([0])
+            self.lod.append([])
 
     def feed(self, data):
         self._feed_impl_(data, self.lod, self.lod_level)
@@ -56,8 +56,7 @@ class DataToLoDTensorConverter(object):
         if lod_level == 0:
             self.data.append(data)
         else:
-            cur_lod_len = len(data)
-            lod[0].append(lod[0][-1] + cur_lod_len)
+            lod[0].append(len(data))
             for each_data in data:
                 self._feed_impl_(each_data, lod[1:], lod_level - 1)
 
@@ -66,7 +65,7 @@ class DataToLoDTensorConverter(object):
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
-            t.set_lod(self.lod)
+            t.set_recursive_sequence_lengths(self.lod)
         return t
 
 
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index 9946d0a4ff..61be39c259 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -18,80 +18,6 @@ import numpy as np
 __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
 
 
-def _validate_lod(lod, tensor_height=-1):
-    """Check whether the input length-based lod info is valid.
-
-    There are several things to check:
-    1. lod should be a list of lists. Empty list is fine.
-    2. The length of each sublist (a lod level) should be at least one.
-    3. Each element in each lod level should be an integer greater than 0.
-    4. The sum of one lod level should be equal to the length of the next lod level.
-    5. The sum of the last lod level should be equal to the tensor height. 
-       Bypass this check if user does not provide tensor_height as input.
-
-    Args:
-        lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]].
-        tensor_height: the outermost dimension of the tensor with which the input 
-            lod is associated with. 
-
-    Returns:
-        A boolean indicating whether the input lod is valid or not.
-    """
-    assert isinstance(lod, list), "lod should be a list"
-    # Empty lod is fine
-    if len(lod) == 0:
-        return True
-
-    lod_sum = []
-    for level in lod:
-        assert isinstance(level, list), "each item in lod should be a list"
-        # Each level of lod should have at least one length info
-        if len(level) < 1:
-            return False
-        level_sum = 0
-        for lod_len in level:
-            # Each length in a level should be > 0
-            if lod_len <= 0:
-                return False
-            level_sum += lod_len
-        lod_sum.append(level_sum)
-
-    for idx, val in enumerate(lod_sum[:-1]):
-        # Each level's sum should be equal to 
-        # the number of items in the next level
-        if val != len(lod[idx + 1]):
-            return False
-
-    if tensor_height == -1:
-        return True
-    else:
-        # Last level's sum should be equal to the tensor height
-        return lod_sum[-1] == tensor_height
-
-
-def _convert_lod(lod):
-    """Convert a length-based lod to a offset-based lod.
-
-    If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]],
-    then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]].
-
-    Args:
-        lod: a length-based lod info. 
-
-    Returns:
-        A list of lists as the offset-based lod converted to from the input lod.
-    """
-    new_lod = []
-    for level in lod:
-        cur_len = 0
-        new_level = [cur_len]
-        for lod_len in level:
-            cur_len += lod_len
-            new_level.append(cur_len)
-        new_lod.append(new_level)
-    return new_lod
-
-
 def create_lod_tensor(data, lod, place):
     """Create a lod tensor from a numpy array, a list, or an existing lod tensor.
 
@@ -139,11 +65,11 @@ def create_lod_tensor(data, lod, place):
         flattened_data = flattened_data.reshape([len(flattened_data), 1])
         return create_lod_tensor(flattened_data, lod, place)
     elif isinstance(data, np.ndarray):
-        assert _validate_lod(lod,
-                             data.shape[0]), "the provided lod info is invalid"
         tensor = core.LoDTensor()
         tensor.set(data, place)
-        tensor.set_lod(_convert_lod(lod))
+        tensor.set_recursive_sequence_lengths(lod)
+        assert tensor.has_valid_recursive_sequence_lengths(
+        ), "the provided lod info is invalid"
         return tensor
     else:
         raise TypeError(
@@ -181,9 +107,8 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
         A fluid LoDTensor object with tensor data and lod info. 
     """
     assert isinstance(base_shape, list), "base_shape should be a list"
-    converted_lod = _convert_lod(lod)
     # append the total number of basic elements to the front of its shape
-    overall_shape = [converted_lod[-1][-1]] + base_shape
+    overall_shape = [sum(lod[-1])] + base_shape
     # the range of integer data elements is [low, high]    
     data = np.random.random_integers(low, high, overall_shape).astype("int64")
     return create_lod_tensor(data, lod, place)
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index ce3ba3ebc5..30b7a634a2 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -22,12 +22,11 @@ class TestDataFeeder(unittest.TestCase):
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
         feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
         result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
-        print(result)
 
         self.assertEqual(result['image'].shape(), [2, 1, 28, 28])
         self.assertEqual(result['label'].shape(), [2, 1])
-        self.assertEqual(result['image'].lod(), [])
-        self.assertEqual(result['label'].lod(), [])
+        self.assertEqual(result['image'].recursive_sequence_lengths(), [])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
 
     def test_lod_level_1_converter(self):
         # lod_level = 1
@@ -42,12 +41,12 @@ class TestDataFeeder(unittest.TestCase):
         # label = [1] * len(data)
         result = feeder.feed(
             [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
-        print(result)
 
         self.assertEqual(result['sentences'].shape(), [9, 1])
         self.assertEqual(result['label'].shape(), [3, 1])
-        self.assertEqual(result['sentences'].lod(), [[0, 3, 5, 9]])
-        self.assertEqual(result['label'].lod(), [])
+        self.assertEqual(result['sentences'].recursive_sequence_lengths(),
+                         [[3, 2, 4]])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
 
     def test_lod_level_2_converter(self):
         # lod_level = 2
@@ -62,12 +61,12 @@ class TestDataFeeder(unittest.TestCase):
         # label = [1] * len(data)
         result = feeder.feed(
             [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
-        print(result)
 
         self.assertEqual(result['paragraphs'].shape(), [9, 1])
         self.assertEqual(result['label'].shape(), [2, 1])
-        self.assertEqual(result['paragraphs'].lod(), [[0, 2, 3], [0, 3, 5, 9]])
-        self.assertEqual(result['label'].lod(), [])
+        self.assertEqual(result['paragraphs'].recursive_sequence_lengths(),
+                         [[2, 1], [3, 2, 4]])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
index 013d72f418..b7e7f5801f 100644
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -13,44 +13,41 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod
-import numpy
+from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor
+import numpy as np
 import unittest
 
 
 class TestLoDTensor(unittest.TestCase):
-    def test_validate_lod(self):
-        lod = (1, 2, 1)
-        self.assertRaises(AssertionError, _validate_lod, lod, -1)
-        lod = [[1, 2], (2, 3)]
-        self.assertRaises(AssertionError, _validate_lod, lod, -1)
-        lod = [1, 2, 3]
-        self.assertRaises(AssertionError, _validate_lod, lod, -1)
-
+    def test_pybind_lod(self):
+        tensor = fluid.LoDTensor()
         lod = []
-        self.assertTrue(_validate_lod(lod, -1))
+        tensor.set_recursive_sequence_lengths(lod)
         lod = [[], [1], [3]]
-        self.assertFalse(_validate_lod(lod, -1))
-        lod = [[0], [-1], [3]]
-        self.assertFalse(_validate_lod(lod, -1))
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod)
+        lod = [[0], [2], [3]]
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod)
 
-        # Each level's sum should be equal to the number of items in the next level
-        # Moreover, last level's sum should be equal to the tensor height
-        lod = [[2, 3], [1, 3, 1, 2, 1]]
-        self.assertTrue(_validate_lod(lod, tensor_height=8))
-        lod = [[1, 3], [2, 1, 3]]
-        self.assertFalse(_validate_lod(lod, tensor_height=6))
-        lod = [[1, 3], [2, 1, 3, 4]]
-        self.assertFalse(_validate_lod(lod, tensor_height=5))
-
-    def test_convert_lod(self):
         lod = [[1, 2, 3]]
-        converted_lod = [[0, 1, 3, 6]]
-        self.assertEqual(_convert_lod(lod), converted_lod)
+        tensor.set_recursive_sequence_lengths(lod)
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        tensor.set(np.random.random([6, 1]), fluid.CPUPlace())
+        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
+        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
 
+        # Each level's sum should be equal to the number of items in the next level
+        # Moreover, last level's sum should be equal to the tensor height
+        lod = [[2, 3], [1, 3, 1, 2, 2]]
+        tensor.set_recursive_sequence_lengths(lod)
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
         lod = [[2, 3], [1, 3, 1, 2, 1]]
-        converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]]
-        self.assertEqual(_convert_lod(lod), converted_lod)
+        tensor.set_recursive_sequence_lengths(lod)
+        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
+        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
 
     def test_create_lod_tensor(self):
         # Create LoDTensor from a list
@@ -60,19 +57,19 @@ class TestLoDTensor(unittest.TestCase):
         self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
                           fluid.CPUPlace())
         tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
-        self.assertEqual(tensor.lod(), [[0, 3, 5]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), correct_lod)
 
         # Create LoDTensor from numpy array
-        data = numpy.random.random([10, 1])
+        data = np.random.random([10, 1])
         lod = [[2, 1], [3, 3, 4]]
         tensor = create_lod_tensor(data, lod, fluid.CPUPlace())
-        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
 
         # Create LoDTensor from another LoDTensor, they are differnt instances
         new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]]
         new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace())
-        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
-        self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        self.assertEqual(new_tensor.recursive_sequence_lengths(), new_lod)
 
     def test_create_random_int_lodtensor(self):
         # The shape of a word, commonly used in speech and NLP problem, is [1]
@@ -83,7 +80,7 @@ class TestLoDTensor(unittest.TestCase):
         high = dict_size - 1
         tensor = create_random_int_lodtensor(lod, shape,
                                              fluid.CPUPlace(), low, high)
-        self.assertEqual(tensor.lod(), [[0, 2, 5, 10]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
         self.assertEqual(tensor.shape(), [10, 1])
 
 
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 307caae4b0..e056ef9952 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -162,7 +162,7 @@ class OpTest(unittest.TestCase):
                     tensor = core.LoDTensor()
                     if isinstance(np_value, tuple):
                         tensor.set(np_value[0], place)
-                        tensor.set_lod(np_value[1])
+                        tensor.set_recursive_sequence_lengths(np_value[1])
                     else:
                         tensor.set(np_value, place)
                     feed_map[name] = tensor
@@ -170,7 +170,8 @@ class OpTest(unittest.TestCase):
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
                     tensor.set(self.inputs[var_name][0], place)
-                    tensor.set_lod(self.inputs[var_name][1])
+                    tensor.set_recursive_sequence_lengths(self.inputs[var_name][
+                        1])
                 else:
                     tensor.set(self.inputs[var_name], place)
                 feed_map[var_name] = tensor
@@ -293,7 +294,8 @@ class OpTest(unittest.TestCase):
                         str(place))
                     if isinstance(expect, tuple):
                         self.assertListEqual(
-                            actual.lod(), expect[1], "Output (" + sub_out_name +
+                            actual.recursive_sequence_lengths(), expect[1],
+                            "Output (" + sub_out_name +
                             ") has different lod at " + str(place))
             else:
                 idx = find_actual(out_name, fetch_list)
@@ -307,8 +309,8 @@ class OpTest(unittest.TestCase):
                     "Output (" + out_name + ") has diff at " + str(place) +
                     str(actual_t) + "\n" + str(expect_t))
                 if isinstance(expect, tuple):
-                    self.assertListEqual(actual.lod(), expect[1],
-                                         "Output (" + out_name +
+                    self.assertListEqual(actual.recursive_sequence_lengths(),
+                                         expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
     def _get_places(self):
@@ -408,7 +410,7 @@ class OpTest(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(np_value, place)
         if lod is not None:
-            tensor.set_lod(lod)
+            tensor.set_recursive_sequence_lengths(lod)
         return tensor
 
     @staticmethod
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 4216d83653..01e5749bdb 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -128,7 +128,7 @@ def create_or_get_tensor(scope, var_name, var, place):
     tensor = scope.var(var_name).get_tensor()
     if var is not None:
         assert isinstance(var, np.ndarray)
-        tensor.set_lod([[]])
+        tensor.set_recursive_sequence_lengths([])
         tensor.set_dims(var.shape)
         tensor.set(var, place)
     return tensor
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index 7976dd7c3f..4e1687477c 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -26,36 +26,36 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
 
     def append_lod_tensor(self, tensor_array, lod, data):
         lod_tensor = core.LoDTensor()
-        lod_tensor.set_lod(lod)
+        lod_tensor.set_recursive_sequence_lengths(lod)
         lod_tensor.set(data, self.place)
         tensor_array.append(lod_tensor)
 
     def test_get_set(self):
         ids = self.scope.var("ids").get_lod_tensor_array()
         self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            ids, [[3, 3], [1, 1, 1, 1, 1, 1]],
             np.array(
                 [1, 2, 3, 4, 5, 6], dtype="int64"))
         self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            ids, [[3, 3], [1, 0, 2, 2, 0, 1]],
             np.array(
                 [0, 1, 2, 3, 4, 5], dtype="int64"))
         self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            ids, [[3, 3], [0, 1, 1, 1, 1, 1]],
             np.array(
                 [0, 1, 2, 3, 4], dtype="int64"))
 
         scores = self.scope.var("scores").get_lod_tensor_array()
         self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            scores, [[3, 3], [1, 1, 1, 1, 1, 1]],
             np.array(
                 [1, 2, 3, 4, 5, 6], dtype="float64"))
         self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            scores, [[3, 3], [1, 0, 2, 2, 0, 1]],
             np.array(
                 [0, 1, 2, 3, 4, 5], dtype="float64"))
         self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            scores, [[3, 3], [0, 1, 1, 1, 1, 1]],
             np.array(
                 [0, 1, 2, 3, 4], dtype="float64"))
 
@@ -73,9 +73,11 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
 
         beam_search_decode_op.run(self.scope, self.place)
 
-        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
-        self.assertEqual(sentence_ids.lod(), expected_lod)
-        self.assertEqual(sentence_scores.lod(), expected_lod)
+        expected_lod = [[4, 4], [1, 2, 3, 3, 1, 3, 3, 3]]
+        self.assertEqual(sentence_ids.recursive_sequence_lengths(),
+                         expected_lod)
+        self.assertEqual(sentence_scores.recursive_sequence_lengths(),
+                         expected_lod)
 
         expected_data = np.array(
             [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index bc708f3aff..5a14178c27 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -48,18 +48,18 @@ class BeamSearchOpTester(unittest.TestCase):
         op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
         print 'selected_ids', np.array(selected_ids)
-        print 'lod', selected_ids.lod()
+        print 'lod', selected_ids.recursive_sequence_lengths()
 
     def _create_pre_ids(self):
         np_data = np.array([[1, 2, 3, 4]], dtype='int64')
         tensor = create_tensor(self.scope, "pre_ids", np_data)
 
     def _create_ids(self):
-        self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]]
+        self.lod = [[1, 3], [1, 1, 1, 1]]
         np_data = np.array(
             [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
         tensor = create_tensor(self.scope, "ids", np_data)
-        tensor.set_lod(self.lod)
+        tensor.set_recursive_sequence_lengths(self.lod)
 
     def _create_scores(self):
         np_data = np.array(
@@ -71,7 +71,7 @@ class BeamSearchOpTester(unittest.TestCase):
             ],
             dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
-        tensor.set_lod(self.lod)
+        tensor.set_recursive_sequence_lengths(self.lod)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index f7461ee6da..1a245fd756 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -65,23 +65,25 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
         distance (numpy.array) : The distance of two entries with shape [M, N].
         lod (list of int): The offsets of each input in this batch.
     """
-    n = len(lod) - 1
+    n = len(lod)
     m = distance.shape[1]
     match_indices = -1 * np.ones((n, m), dtype=np.int)
     match_dist = np.zeros((n, m), dtype=np.float32)
-    for i in range(len(lod) - 1):
-        bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
-                        match_dist[i, :])
+    cur_offset = 0
+    for i in range(n):
+        bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :],
+                        match_indices[i, :], match_dist[i, :])
         if match_type == 'per_prediction':
-            argmax_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
-                         match_dist[i, :], dist_threshold)
+            argmax_match(distance[cur_offset:(cur_offset + lod[i]), :],
+                         match_indices[i, :], match_dist[i, :], dist_threshold)
+        cur_offset += lod[i]
     return match_indices, match_dist
 
 
 class TestBipartiteMatchOpWithLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 5, 11, 23]]
+        lod = [[5, 6, 12]]
         dist = np.random.random((23, 217)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
@@ -98,7 +100,7 @@ class TestBipartiteMatchOpWithLoD(OpTest):
 class TestBipartiteMatchOpWithoutLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 8]]
+        lod = [[8]]
         dist = np.random.random((8, 17)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
@@ -115,7 +117,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
 class TestBipartiteMatchOpWithPerPredictionType(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 5, 11, 23]]
+        lod = [[5, 6, 12]]
         dist = np.random.random((23, 237)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0],
                                                           'per_prediction', 0.5)
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index b4c48d85f2..4ce9a4783e 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -81,15 +81,19 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
     n = target_box.shape[0]
     m = prior_box.shape[0]
     output_box = np.zeros((n, m, 4), dtype=np.float32)
-    for i in range(len(lod) - 1):
+    cur_offset = 0
+    for i in range(len(lod)):
         if (code_type == "EncodeCenterSize"):
-            box_coder(target_box[lod[i]:lod[i + 1], :], prior_box,
-                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
+            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :],
+                      prior_box, prior_box_var,
+                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
                       code_type, box_normalized)
         elif (code_type == "DecodeCenterSize"):
-            box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box,
-                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
+            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :],
+                      prior_box, prior_box_var,
+                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
                       code_type, box_normalized)
+        cur_offset += lod[i]
     return output_box
 
 
@@ -99,7 +103,7 @@ class TestBoxCoderOp(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[0, 1, 2, 3, 4, 5]]
+        lod = [[1, 1, 1, 1, 1]]
         prior_box = np.random.random((10, 4)).astype('float32')
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((5, 10, 4)).astype('float32')
@@ -152,7 +156,7 @@ class TestBoxCoderOpWithLoD(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[0, 4, 12, 20]]
+        lod = [[4, 8, 8]]
         prior_box = np.random.random((10, 4)).astype('float32')
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((20, 4)).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
index 050df2801c..23932194f0 100644
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
@@ -144,10 +144,10 @@ class TestChunkEvalOp(OpTest):
         starts = sorted(starts)
         self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
             infer, label, starts)
-        self.inputs = {
-            'Inference': (infer, [starts]),
-            'Label': (label, [starts])
-        }
+        lod = []
+        for i in range(len(starts) - 1):
+            lod.append(starts[i + 1] - starts[i])
+        self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])}
         precision = float(
             self.num_correct_chunks
         ) / self.num_infer_chunks if self.num_infer_chunks else 0
diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
index f397f542bb..122b076c2d 100644
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
@@ -22,9 +22,9 @@ from op_test import OpTest
 class CRFDecoding(object):
     def __init__(self, emission_weights, transition_weights,
                  seq_start_positions):
-        assert (emission_weights.shape[0] == seq_start_positions[-1])
+        assert (emission_weights.shape[0] == sum(seq_start_positions))
         self.tag_num = emission_weights.shape[1]
-        self.seq_num = len(seq_start_positions) - 1
+        self.seq_num = len(seq_start_positions)
 
         self.seq_start_positions = seq_start_positions
         self.x = emission_weights
@@ -34,9 +34,9 @@ class CRFDecoding(object):
         self.w = transition_weights[2:, :]
 
         self.track = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="int64")
+            (sum(seq_start_positions), self.tag_num), dtype="int64")
         self.decoded_path = np.zeros(
-            (seq_start_positions[-1], 1), dtype="int64")
+            (sum(seq_start_positions), 1), dtype="int64")
 
     def _decode_one_sequence(self, decoded_path, x):
         seq_len, tag_num = x.shape
@@ -71,9 +71,11 @@ class CRFDecoding(object):
             decoded_path[i - 1] = max_idx = track[i, max_idx]
 
     def decode(self):
+        cur_pos = 0
         for i in range(self.seq_num):
-            start = self.seq_start_positions[i]
-            end = self.seq_start_positions[i + 1]
+            start = cur_pos
+            cur_pos += self.seq_start_positions[i]
+            end = cur_pos
             self._decode_one_sequence(self.decoded_path[start:end, :],
                                       self.x[start:end, :])
         return self.decoded_path
@@ -90,11 +92,13 @@ class TestCRFDecodingOp1(OpTest):
         TAG_NUM = 17
         MAX_SEQ_LEN = 10
 
-        lod = [[0]]
+        lod = [[]]
+        total_len = 0
         for i in range(SEQ_NUM):
-            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            total_len += lod[-1][-1]
         emission = np.random.uniform(-1, 1,
-                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+                                     [total_len, TAG_NUM]).astype("float64")
         transition = np.random.uniform(-0.5, 0.5,
                                        [TAG_NUM + 2, TAG_NUM]).astype("float64")
 
@@ -126,7 +130,8 @@ class TestCRFDecodingOp2(OpTest):
         self.op_type = "crf_decoding"
         TAG_NUM = 5
 
-        lod = [[0, 1, 3, 6, 10]]
+        lod = [[1, 2, 3, 4]]
+        total_len = sum(lod[-1])
         transition = np.repeat(
             np.arange(
                 TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
@@ -135,13 +140,13 @@ class TestCRFDecodingOp2(OpTest):
         emission = np.repeat(
             np.arange(
                 TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            lod[-1][-1],
+            total_len,
             axis=0)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+            low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64")
         predicted_labels = np.ones(
-            (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1)
+            (total_len, 1), dtype="int64") * (TAG_NUM - 1)
         expected_output = (labels == predicted_labels).astype("int64")
 
         self.inputs = {
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index f166031a1c..131b4076f4 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -22,14 +22,16 @@ from test_softmax_op import stable_softmax
 def CTCAlign(input, lod, blank, merge_repeated):
     lod0 = lod[0]
     result = []
-    for i in range(len(lod0) - 1):
+    cur_offset = 0
+    for i in range(len(lod0)):
         prev_token = -1
-        for j in range(lod0[i], lod0[i + 1]):
+        for j in range(cur_offset, cur_offset + lod0[i]):
             token = input[j][0]
             if (token != blank) and not (merge_repeated and
                                          token == prev_token):
                 result.append(token)
             prev_token = token
+        cur_offset += lod0[i]
     result = np.array(result).reshape([len(result), 1]).astype("int32")
     if len(result) == 0:
         result = np.array([-1])
@@ -39,7 +41,7 @@ def CTCAlign(input, lod, blank, merge_repeated):
 class TestCTCAlignOp(OpTest):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 11, 18]]
+        self.input_lod = [[11, 7]]
         self.blank = 0
         self.merge_repeated = False
         self.input = np.array(
@@ -66,7 +68,7 @@ class TestCTCAlignOp(OpTest):
 class TestCTCAlignOpCase1(TestCTCAlignOp):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 11, 19]]
+        self.input_lod = [[11, 8]]
         self.blank = 0
         self.merge_repeated = True
         self.input = np.array(
@@ -77,7 +79,7 @@ class TestCTCAlignOpCase1(TestCTCAlignOp):
 class TestCTCAlignOpCase2(TestCTCAlignOp):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 4]]
+        self.input_lod = [[4]]
         self.blank = 0
         self.merge_repeated = True
         self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index f545ad155c..05d3367ad8 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -74,13 +74,13 @@ class TestDetectionMAPOp(OpTest):
         self.evaluate_difficult = True
         self.ap_type = "integral"
 
-        self.label_lod = [[0, 2, 4]]
+        self.label_lod = [[2, 2]]
         # label difficult xmin ymin xmax ymax
         self.label = [[1, 0, 0.1, 0.1, 0.3, 0.3], [1, 1, 0.6, 0.6, 0.8, 0.8],
                       [2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]]
 
         # label score xmin ymin xmax ymax difficult
-        self.detect_lod = [[0, 3, 7]]
+        self.detect_lod = [[3, 4]]
         self.detect = [
             [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3],
             [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4],
@@ -89,7 +89,7 @@ class TestDetectionMAPOp(OpTest):
         ]
 
         # label score true_pos false_pos
-        self.tf_pos_lod = [[0, 3, 7]]
+        self.tf_pos_lod = [[3, 4]]
         self.tf_pos = [[1, 0.9, 1, 0], [1, 0.7, 1, 0], [1, 0.3, 0, 1],
                        [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0],
                        [3, 0.2, 0, 1]]
@@ -112,15 +112,19 @@ class TestDetectionMAPOp(OpTest):
             for i, count in enumerate(class_pos_count):
                 class_pos_count_dict[i] = count
 
-            for i in range(len(true_pos_lod[0]) - 1):
-                start = true_pos_lod[0][i]
-                end = true_pos_lod[0][i + 1]
+            cur_pos = 0
+            for i in range(len(true_pos_lod[0])):
+                start = cur_pos
+                cur_pos += true_pos_lod[0][i]
+                end = cur_pos
                 for j in range(start, end):
                     true_pos_dict[i].append(true_pos[j])
 
-            for i in range(len(false_pos_lod[0]) - 1):
-                start = false_pos_lod[0][i]
-                end = false_pos_lod[0][i + 1]
+            cur_pos = 0
+            for i in range(len(false_pos_lod[0])):
+                start = cur_pos
+                cur_pos += false_pos_lod[0][i]
+                end = cur_pos
                 for j in range(start, end):
                     false_pos_dict[i].append(false_pos[j])
 
@@ -130,19 +134,19 @@ class TestDetectionMAPOp(OpTest):
             label_number = self.class_num
 
             out_class_pos_count = []
-            out_true_pos_lod = [0]
+            out_true_pos_lod = []
             out_true_pos = []
-            out_false_pos_lod = [0]
+            out_false_pos_lod = []
             out_false_pos = []
 
             for i in range(label_number):
                 out_class_pos_count.append([label_count[i]])
                 true_pos_list = true_pos[i]
                 out_true_pos += true_pos_list
-                out_true_pos_lod.append(len(out_true_pos))
+                out_true_pos_lod.append(len(true_pos_list))
                 false_pos_list = false_pos[i]
                 out_false_pos += false_pos_list
-                out_false_pos_lod.append(len(out_false_pos))
+                out_false_pos_lod.append(len(false_pos_list))
 
             return out_class_pos_count, out_true_pos, [
                 out_true_pos_lod
@@ -241,7 +245,7 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
 
         self.evaluate_difficult = False
 
-        self.tf_pos_lod = [[0, 2, 6]]
+        self.tf_pos_lod = [[2, 4]]
         # label score true_pos false_pos
         self.tf_pos = [[1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0],
                        [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]
@@ -267,9 +271,9 @@ class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
     def init_test_case(self):
         super(TestDetectionMAPOpMultiBatch, self).init_test_case()
         self.class_pos_count = [0, 2, 1]
-        self.true_pos_lod = [[0, 0, 3, 5]]
+        self.true_pos_lod = [[0, 3, 2]]
         self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
-        self.false_pos_lod = [[0, 0, 3, 5]]
+        self.false_pos_lod = [[0, 3, 2]]
         self.false_pos = [[0.7, 0.], [0.3, 1.], [0.2, 0.], [0.8, 1.], [0.1, 0.]]
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 95af51f1b2..0f289af284 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -136,16 +136,16 @@ class BaseRNN(object):
         feed_dict = dict()
 
         for iname in self.inputs:
-            lod = [0]
+            lod = []
             np_flatten = []
             for seq_id in xrange(len(self.inputs[iname])):
                 seq_len = len(self.inputs[iname][seq_id])
-                lod.append(lod[-1] + seq_len)
+                lod.append(seq_len)
                 np_flatten.extend(self.inputs[iname][seq_id])
 
             t = fluid.Tensor()
             t.set(numpy.array(np_flatten), place)
-            t.set_lod([lod])
+            t.set_recursive_sequence_lengths([lod])
             feed_dict[iname] = t
 
         for pname in self.params:
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index d3f63ee2c4..92e718662d 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -39,20 +39,20 @@ class TestDyRnnStaticInput(unittest.TestCase):
 
     def prepare_x_tensor(self):
         self.x_tensor_dim = 10
-        lod = [[0, 2, 3, 6]]
-        shape = [lod[0][-1], self.x_tensor_dim]
+        lod = [[2, 1, 3]]
+        shape = [sum(lod[0]), self.x_tensor_dim]
         self.x_tensor_data = np.random.random(shape).astype('float32')
         self.x_tensor = core.LoDTensor()
-        self.x_tensor.set_lod(lod)
+        self.x_tensor.set_recursive_sequence_lengths(lod)
         self.x_tensor.set(self.x_tensor_data, self.place)
 
     def prepare_static_input_tensor(self):
         self.static_input_tensor_dim = 4
-        lod = [[0, 1, 3, 6]]
-        shape = [lod[0][-1], self.static_input_tensor_dim]
+        lod = [[1, 2, 3]]
+        shape = [sum(lod[0]), self.static_input_tensor_dim]
         self.static_input_data = np.random.random(shape).astype('float32')
         self.static_input_tensor = core.LoDTensor()
-        self.static_input_tensor.set_lod(lod)
+        self.static_input_tensor.set_recursive_sequence_lengths(lod)
         self.static_input_tensor.set(self.static_input_data, self.place)
 
     def fetch_value(self, var):
@@ -69,7 +69,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
         ndarray = np.zeros(shape=dims).astype('float32')
         for i in xrange(np.product(dims)):
             ndarray.ravel()[i] = lod_tensor.get_float_element(i)
-        return ndarray, lod_tensor.lod()
+        return ndarray, lod_tensor.recursive_sequence_lengths()
 
     def build_graph(self, only_forward=False):
         x_tensor = fluid.layers.data(
@@ -131,21 +131,20 @@ class TestDyRnnStaticInput(unittest.TestCase):
             framework.grad_var_name('static_input_tensor'))
         return static_input_grad, loss
 
-    def get_seq_len_from_lod(self, lod):
-        return [lod[0][i + 1] - lod[0][i] for i in xrange(len(lod[0]) - 1)]
-
     def get_expected_static_step_outs(self):
-        x_lod = self.x_tensor.lod()
-        x_seq_len = self.get_seq_len_from_lod(x_lod)
+        x_lod = self.x_tensor.recursive_sequence_lengths()
+        x_seq_len = x_lod[0]
         x_seq_len_sorted = sorted(x_seq_len)
         x_sorted_indices = np.argsort(x_seq_len)[::-1]
 
-        static_lod = self.static_input_tensor.lod()
-        static_sliced = [
-            self.static_input_data[static_lod[0][i]:static_lod[0][i + 1]]
-            for i in xrange(len(static_lod[0]) - 1)
-        ]
-        static_seq_len = self.get_seq_len_from_lod(static_lod)
+        static_lod = self.static_input_tensor.recursive_sequence_lengths()
+        static_sliced = []
+        cur_offset = 0
+        for i in xrange(len(static_lod[0])):
+            static_sliced.append(self.static_input_data[cur_offset:(
+                cur_offset + static_lod[0][i])])
+            cur_offset += static_lod[0][i]
+        static_seq_len = static_lod[0]
         static_reordered = []
         for i in xrange(len(x_sorted_indices)):
             static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist())
@@ -159,11 +158,13 @@ class TestDyRnnStaticInput(unittest.TestCase):
 
         for i in xrange(self._max_sequence_len):
             end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1)
-            lod = [0]
+            lod = []
+            total_len = 0
             for i in xrange(end):
-                lod.append(static_seq_len_reordered[i] + lod[-1])
+                lod.append(static_seq_len_reordered[i])
+                total_len += lod[-1]
             static_step_lods.append([lod])
-            end = lod[-1]
+            end = total_len
             static_step_outs.append(
                 np.array(static_reordered[:end]).astype('float32'))
 
@@ -199,7 +200,9 @@ class TestDyRnnStaticInput(unittest.TestCase):
             self.static_input_tensor.set_float_element(i, origin)
             numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2
         self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001))
-        self.assertTrue(np.allclose(actual_lod, self.static_input_tensor.lod()))
+        self.assertTrue(
+            np.allclose(actual_lod,
+                        self.static_input_tensor.recursive_sequence_lengths()))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
index 2957fb5058..816562621b 100644
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
@@ -52,23 +52,29 @@ class TestEditDistanceOp(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = False
-        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64")
+        x1 = np.array([[12, 3, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[12, 4, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [0, 1, 5]
-        x2_lod = [0, 3, 4]
+        x1_lod = [1, 4]
+        x2_lod = [3, 1]
 
-        num_strs = len(x1_lod) - 1
+        num_strs = len(x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(2).astype("int64")
+
+        x1_offset = 0
+        x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
-                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
+            x1_offset += x1_lod[i]
+            x2_offset += x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i + 1] - x2_lod[i]
+                len_ref = x2_lod[i]
                 distance[i] = distance[i] / len_ref
+
         self.attrs = {'normalized': normalized}
         self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
@@ -81,23 +87,29 @@ class TestEditDistanceOpNormalized(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = True
-        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64")
+        x1 = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [0, 1, 3, 6]
-        x2_lod = [0, 2, 3, 5]
+        x1_lod = [1, 2, 3]
+        x2_lod = [2, 1, 2]
 
-        num_strs = len(x1_lod) - 1
+        num_strs = len(x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(3).astype("int64")
+
+        x1_offset = 0
+        x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
-                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
+            x1_offset += x1_lod[i]
+            x2_offset += x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i + 1] - x2_lod[i]
+                len_ref = x2_lod[i]
                 distance[i] = distance[i] / len_ref
+
         self.attrs = {'normalized': normalized}
         self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
index 9d724a6479..8b9da84311 100644
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
@@ -24,17 +24,16 @@ class TestFeedFetch(unittest.TestCase):
         input_array = np.ones((4, 4, 6)).astype("float32")
         input_array[0, 0, 0] = 3
         input_array[3, 3, 5] = 10
-        input_tensor = core.LoDTensor([[0, 2, 4]])
+        input_tensor = core.LoDTensor([[2, 2]])
         input_tensor.set(input_array, place)
 
         core.set_feed_variable(scope, input_tensor, "feed", 0)
 
         output_tensor = core.get_fetch_variable(scope, "feed", 0)
 
-        output_lod = output_tensor.lod()
-        self.assertEqual(0, output_lod[0][0])
+        output_lod = output_tensor.recursive_sequence_lengths()
+        self.assertEqual(2, output_lod[0][0])
         self.assertEqual(2, output_lod[0][1])
-        self.assertEqual(4, output_lod[0][2])
 
         output_array = np.array(output_tensor)
         self.assertEqual(3, output_array[0, 0, 0])
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
index 533d8ccfac..0c75cf33f5 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
@@ -55,7 +55,7 @@ class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
         self.op_type = "fill_constant_batch_size_like"
         self.inputs = {
             'Input': (np.random.random((31, 28)).astype("float32"),
-                      [[0, 9, 23, 31]])
+                      [[9, 14, 8]])
         }
         self.attrs = {
             'value': 3.5,
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index 3a13eb872a..8fbf156085 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -20,8 +20,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu
 
 
 class TestGRUOp(OpTest):
-    lod = [[0, 2, 6, 9]]
-    batch_size = lod[0][-1]
+    lod = [[2, 4, 3]]
+    batch_size = sum(lod[0])
     frame_size = 5
     activate = {
         'identity': identity,
@@ -33,10 +33,10 @@ class TestGRUOp(OpTest):
     @staticmethod
     def seq_to_batch(lod, is_reverse):
         idx_in_seq_list = []
-        seq_starts = lod[0]
-        seq_lens = []
-        for i in range(len(seq_starts) - 1):
-            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        seq_lens = lod[0]
+        seq_starts = [0]
+        for i in range(len(seq_lens)):
+            seq_starts.append(seq_starts[-1] + seq_lens[i])
         sorted_seqs = sorted(
             range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
         num_batch = seq_lens[sorted_seqs[0]]
diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
index 8f62ac20a5..eff4212d91 100644
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
@@ -58,8 +58,8 @@ class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
 
     def setUp(self):
         super(TestIOUSimilarityOpWithLoD, self).setUp()
-        self.boxes1_lod = [[0, 1, 2]]
-        self.output_lod = [[0, 1, 2]]
+        self.boxes1_lod = [[1, 1]]
+        self.output_lod = [[1, 1]]
 
         self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
         self.outputs = {'Out': (self.output, self.output_lod)}
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
index f49f7635f7..696d0ab4fa 100644
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -105,11 +105,13 @@ class TestLinearChainCrfOp(OpTest):
         MAX_SEQ_LEN = 5
 
         # the linear_chain_crf operator only supports sequence (LoD level = 1)
-        lod = [[0]]
+        lod = [[]]
+        seq_start_pos = [0]
         for i in range(SEQ_NUM):
-            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
-        emission = np.random.uniform(-1, 1,
-                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
+        emission = np.random.uniform(
+            -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
         emission_row_max = np.amax(emission, axis=1, keepdims=True)
         emission_exps = np.exp(emission - emission_row_max)
 
@@ -118,14 +120,14 @@ class TestLinearChainCrfOp(OpTest):
         transition_exps = np.exp(transition)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
 
         self.inputs = {
             "Emission": (emission, lod),
             "Transition": transition,
             "Label": (labels, lod)
         }
-        crf = LinearChainCrfForward(lod[0], emission, emission_row_max,
+        crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max,
                                     emission_exps, transition, transition_exps,
                                     labels)
         alpha, log_likelihood = crf.crf_forward_compute()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index 093eecb837..bac5e50231 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -30,7 +30,8 @@ class TestLoDRankTable(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
-        tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
+        tensor.set_recursive_sequence_lengths(
+            [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]])
         exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
index 6b6d4c824a..77905c4b96 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
@@ -21,11 +21,15 @@ class TestLodResetOpByAttr(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0 = [0, 7, 10]
+        lod = [[3, 2, 5]]
+        # target_offset_lod and target_lod are the same lod info represented
+        # in offset-based format and length-based format, respectively.
+        target_offset_lod = [0, 7, 10]
+        target_lod = [7, 3]
         self.inputs = {'X': (x, lod)}
-        self.attrs = {'target_lod': target_lod_0}
-        self.outputs = {'Out': (x, [target_lod_0])}
+        # The `target_lod` attribute is still based on offset
+        self.attrs = {'target_lod': target_offset_lod}
+        self.outputs = {'Out': (x, [target_lod])}
 
     def test_check_output(self):
         self.check_output()
@@ -38,13 +42,16 @@ class TestLodResetOpByInput(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0 = [0, 4, 7, 10]
+        lod = [[3, 2, 5]]
+        # target_offset_lod and target_lod are the same lod info represented
+        # in offset-based format and length-based format, respectively.
+        target_offset_lod = [0, 4, 7, 10]
+        target_lod = [4, 3, 3]
         self.inputs = {
             'X': (x, lod),
-            'Y': np.array([target_lod_0]).astype('int32')
+            'Y': np.array([target_offset_lod]).astype('int32')
         }
-        self.outputs = {'Out': (x, [target_lod_0])}
+        self.outputs = {'Out': (x, [target_lod])}
 
     def test_check_output(self):
         self.check_output()
@@ -57,15 +64,16 @@ class TestLodResetOpBoth(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0_attr = [0, 7, 10]
-        target_lod_0_in = [0, 4, 7, 10]
+        lod = [[3, 2, 5]]
+        target_offset_lod_attr = [0, 7, 10]
+        target_offset_lod_in = [0, 4, 7, 10]
+        target_lod_in = [4, 3, 3]
         self.inputs = {
             'X': (x, lod),
-            'Y': np.array(target_lod_0_in).astype('int32')
+            'Y': np.array(target_offset_lod_in).astype('int32')
         }
-        self.attrs = {'target_lod': target_lod_0_attr}
-        self.outputs = {'Out': (x, [target_lod_0_in])}
+        self.attrs = {'target_lod': target_offset_lod_attr}
+        self.outputs = {'Out': (x, [target_lod_in])}
 
     def test_check_output(self):
         self.check_output()
@@ -78,11 +86,11 @@ class TestLodResetOpYIsLoDTensor(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
+        lod = [[3, 2, 5]]
         y = np.random.random((10, 10)).astype("float32")
-        target_lod_0 = [[0, 4, 7, 10]]
-        self.inputs = {'X': (x, lod), 'Y': (y, target_lod_0)}
-        self.outputs = {'Out': (x, target_lod_0)}
+        target_lod = [[4, 3, 3]]
+        self.inputs = {'X': (x, lod), 'Y': (y, target_lod)}
+        self.outputs = {'Out': (x, target_lod)}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
index 63b17a5ccd..118c22fbb1 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
@@ -27,7 +27,7 @@ class TestLoDTensorArray(unittest.TestCase):
         for i in xrange(10):
             t = core.LoDTensor()
             t.set(numpy.array([i], dtype='float32'), cpu)
-            t.set_lod([[0, 1]])
+            t.set_recursive_sequence_lengths([[1]])
             tensor_array.append(t)
 
         self.assertEqual(10, len(tensor_array))
@@ -35,17 +35,17 @@ class TestLoDTensorArray(unittest.TestCase):
         for i in xrange(10):
             t = tensor_array[i]
             self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
-            self.assertEqual([[0, 1]], t.lod())
+            self.assertEqual([[1]], t.recursive_sequence_lengths())
 
             t = core.LoDTensor()
             t.set(numpy.array([i + 10], dtype='float32'), cpu)
-            t.set_lod([[0, 2]])
+            t.set_recursive_sequence_lengths([[1]])
             tensor_array[i] = t
             t = tensor_array[i]
             self.assertEqual(
                 numpy.array(t), numpy.array(
                     [i + 10], dtype='float32'))
-            self.assertEqual([[0, 2]], t.lod())
+            self.assertEqual([[1]], t.recursive_sequence_lengths())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index 66a03640c1..cebe6997bb 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -29,7 +29,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
         self.main(
@@ -42,7 +42,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
         self.main(
@@ -55,7 +55,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
+        tensor.set_recursive_sequence_lengths([[2, 3], [3, 6, 2, 6, 3]])
 
         expect = [
             numpy.array(
@@ -65,7 +65,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
                 [17, 18, 19], dtype='int32')
         ]
 
-        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
+        lod = [[[2, 3]], [[6, 6]], [[3]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -77,8 +77,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor.set(
             numpy.arange(31).reshape(31, 1).astype('int32'), self.place())
 
-        tensor.set_lod([[0, 3, 5, 9, 11],
-                        [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]])
+        tensor.set_recursive_sequence_lengths(
+            [[3, 2, 4, 2], [3, 4, 4, 0, 1, 5, 2, 2, 2, 7, 1]])
 
         expect = [
             numpy.array(
@@ -88,7 +88,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
         ]
 
-        lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]]
+        lod = [[[5, 3, 0, 7]], [[2, 4, 1, 1]], [[2, 4]], [[2]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -99,8 +99,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
-                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        tensor.set_recursive_sequence_lengths(
+            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
+             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
 
         expect = [
             numpy.array(
@@ -108,8 +109,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
                 22, 39) + range(7, 21), range(39, 46)]
         ]
-        lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]],
-               [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]]
+        lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]],
+               [[2], [6, 1]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -120,8 +121,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
-                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        tensor.set_recursive_sequence_lengths(
+            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
+             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
         self.main(
             tensor=tensor,
             expect_array=None,
@@ -162,12 +164,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             exp_tensor, exp_lod = exp
             exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
             self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
-            self.assertEqual(exp_lod, array[i].lod())
+            self.assertEqual(exp_lod, array[i].recursive_sequence_lengths())
 
     def check_tensor_same(self, actual, expect):
         self.assertTrue(
             numpy.allclose(numpy.array(actual), numpy.array(expect)))
-        self.assertEqual(actual.lod(), expect.lod())
+        self.assertEqual(actual.recursive_sequence_lengths(),
+                         expect.recursive_sequence_lengths())
 
 
 class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
@@ -188,7 +191,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         g_vars = program.global_block().var(x.name + "@GRAD")
 
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index e726f99d49..705a24bd8f 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -84,15 +84,17 @@ def lstm(
         h = g_o * act_cell(c)
         return h, c
 
-    def _reverse(x, lod):
+    def _reverse(x, offset):
         y = np.zeros_like(x)
-        for i in range(len(lod) - 1):
-            b, e = lod[i], lod[i + 1]
+        for i in range(len(offset) - 1):
+            b, e = offset[i], offset[i + 1]
             y[b:e, :] = np.flip(x[b:e, :], 0)
         return y
 
-    offset = lod[0]
-    batch_size = len(offset) - 1
+    offset = [0]
+    for l in lod[0]:
+        offset.append(offset[-1] + l)
+    batch_size = len(lod[0])
     hidden = []
     cell = []
     input = _reverse(input, offset) if is_reverse else input
@@ -100,7 +102,7 @@ def lstm(
         input = input + np.tile(w_b, (offset[-1], 1))
     for i in range(batch_size):
         # compute one sequence
-        seq_len = offset[i + 1] - offset[i]
+        seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
         h_pre = h0[i]  # 1 x D
         c_pre = c0[i]  # 1 x D
@@ -124,7 +126,7 @@ def lstm(
 
 class TestLstmOp(OpTest):
     def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
+        self.lod = [[2, 3, 2]]
         self.D = 16
 
         self.act_gate = 'sigmoid'
@@ -139,8 +141,8 @@ class TestLstmOp(OpTest):
         self.set_argument()
         self.op_type = 'lstm'
 
-        T = self.lod[0][-1]
-        N = len(self.lod[0]) - 1
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
 
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
@@ -186,7 +188,7 @@ class TestLstmOp(OpTest):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
@@ -196,7 +198,7 @@ class TestLstmOp(OpTest):
 
 # class TestLstmOpHasInitial(TestLstmOp):
 #     def set_argument(self):
-#         self.lod = [[0, 2, 5, 7]]
+#         self.lod = [[2, 3, 2]]
 #         self.D = 16
 
 #         self.act_gate = 'sigmoid'
@@ -209,7 +211,7 @@ class TestLstmOp(OpTest):
 
 #     def test_check_grad(self):
 #         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -218,7 +220,7 @@ class TestLstmOp(OpTest):
 #             max_relative_error=5e-4)
 
 #     def test_check_grad_ingore_bias(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -228,7 +230,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('Bias'))
 
 #     def test_check_grad_ingore_weight(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -238,7 +240,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('Weight'))
 
 #     def test_check_grad_ingore_input(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -248,7 +250,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('Input'))
 
 #     def test_check_grad_ingore_h0(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -258,7 +260,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('H0'))
 
 #     def test_check_grad_ingore_c0(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -269,7 +271,7 @@ class TestLstmOp(OpTest):
 
 # class TestLstmOpRerverse(TestLstmOp):
 #     def set_argument(self):
-#         self.lod = [[0, 2, 5, 7]]
+#         self.lod = [[2, 3, 2]]
 #         self.D = 16
 
 #         self.act_gate = 'sigmoid'
@@ -282,7 +284,7 @@ class TestLstmOp(OpTest):
 
 # class TestLstmOpNotUsePeepholes(TestLstmOp):
 #     def set_argument(self):
-#         self.lod = [[0, 2, 5, 7]]
+#         self.lod = [[2, 3, 2]]
 #         self.D = 16
 
 #         self.act_gate = 'sigmoid'
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
index afff133f6c..ed2262da4b 100644
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
@@ -64,15 +64,17 @@ def lstmp(
         r = act_proj(r)
         return r, c
 
-    def _reverse(x, lod):
+    def _reverse(x, offset):
         y = np.zeros_like(x)
-        for i in range(len(lod) - 1):
-            b, e = lod[i], lod[i + 1]
+        for i in range(len(offset) - 1):
+            b, e = offset[i], offset[i + 1]
             y[b:e, :] = np.flip(x[b:e, :], 0)
         return y
 
-    offset = lod[0]
-    batch_size = len(offset) - 1
+    offset = [0]
+    for l in lod[0]:
+        offset.append(offset[-1] + l)
+    batch_size = len(lod[0])
     # recurrent projection state
     projection = []
     cell = []
@@ -81,7 +83,7 @@ def lstmp(
         input = input + np.tile(w_b, (offset[-1], 1))
     for i in range(batch_size):
         # compute one sequence
-        seq_len = offset[i + 1] - offset[i]
+        seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
         r_pre = np.dot(h0[i], w_rh)  # 1 x P
         r_pre = act_proj(r_pre)
@@ -117,8 +119,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         self.reset_argument()
         self.op_type = 'lstmp'
 
-        T = self.lod[0][-1]
-        N = len(self.lod[0]) - 1
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
 
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
@@ -166,7 +168,7 @@ class TestLstmpOp(LstmTest.TestLstmOp):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -183,7 +185,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -195,7 +197,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             max_relative_error=1e-2)
 
     def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -207,7 +209,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Bias'))
 
     def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -219,7 +221,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Weight'))
 
     def test_check_grad_ingore_proj_weight(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -231,7 +233,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('ProjWeight'))
 
     def test_check_grad_ingore_input(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -243,7 +245,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Input'))
 
     def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -255,7 +257,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('H0'))
 
     def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
index c27573c3d6..54ee85c1a7 100644
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
@@ -70,7 +70,7 @@ class TestMineHardExamplesOp(OpTest):
 
         self.updated_match_indices = self.match_indices
 
-        self.neg_indices_lod = [[0, 1, 2]]
+        self.neg_indices_lod = [[1, 1]]
         self.neg_indices = np.array([[1], [0]]).astype('int32')
 
 
@@ -92,7 +92,7 @@ class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
         self.updated_match_indices = np.array([[0, -1, -1],
                                                [-1, -1, -1]]).astype('int32')
 
-        self.neg_indices_lod = [[0, 1, 3]]
+        self.neg_indices_lod = [[1, 2]]
         self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 6459913c01..aacd8ae45a 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -135,12 +135,12 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
     batch_size = scores.shape[0]
 
     det_outs = []
-    lod = [0]
+    lod = []
     for n in range(batch_size):
         nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background,
                                                score_threshold, nms_threshold,
                                                nms_top_k, keep_top_k)
-        lod.append(lod[-1] + nmsed_num)
+        lod.append(nmsed_num)
         if nmsed_num == 0: continue
 
         for c, indices in nmsed_outs.iteritems():
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
index cd78cce872..d13f2b3afd 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
@@ -27,9 +27,9 @@ class TestOneHotOp(OpTest):
         self.op_type = 'one_hot'
         depth = 10
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
-        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
@@ -50,9 +50,9 @@ class TestOneHotOp_default_dtype(OpTest):
         self.op_type = 'one_hot'
         depth = 10
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
-        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
@@ -75,11 +75,11 @@ class TestOneHotOp_exception(OpTest):
         self.place = core.CPUPlace()
         self.dimension = 12
         self.x = core.LoDTensor()
-        x_lod = [[0, 4, 5, 8, 11]]
-        data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])]
-        data = np.array(data).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))]
+        data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
         self.x.set(data, self.place)
-        self.x.set_lod(x_lod)
+        self.x.set_recursive_sequence_lengths(x_lod)
 
     def test_check_output(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index c75080fbb9..e01af42a58 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -28,7 +28,7 @@ class TestPrintOpCPU(unittest.TestCase):
         self.x_tensor = core.LoDTensor()
         tensor_np = np.random.random(size=(2, 3)).astype('float32')
         self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_lod([[0, 1, 1]])
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
     def build_network(self, only_forward, **kargs):
         x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
@@ -62,7 +62,7 @@ class TestPrintOpGPU(TestPrintOpCPU):
         self.x_tensor = core.LoDTensor()
         tensor_np = np.random.random(size=(2, 3)).astype('float32')
         self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_lod([[0, 1, 1]])
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
index 76d0d2f2fe..a70321bd80 100644
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
@@ -70,11 +70,10 @@ class TestReorderLoDTensor(unittest.TestCase):
                 lod_level_i = numpy.random.randint(
                     low=1,
                     high=5,
-                    size=self.num_seq if i == 0 else lod_level_i[-1])
-                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                    size=self.num_seq if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[data_lod[-1][-1] if data_lod else self.num_seq
+                size=[sum(data_lod[-1]) if data_lod else self.num_seq
                       ] + data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
@@ -84,29 +83,36 @@ class TestReorderLoDTensor(unittest.TestCase):
             tensor = fluid.Tensor()
             tensor.set(self.data[desc[0]][0], place)
             if self.data[desc[0]][1]:
-                tensor.set_lod(self.data[desc[0]][1])
+                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
             self.inputs[desc[0]] = tensor
 
     def reorder(self):
-        level = 0
+        def convert_to_offset(lod):
+            offset_lod = [[0] for i in lod]
+            for i, level in enumerate(lod):
+                for seq_len in level:
+                    offset_lod[i].append(offset_lod[i][-1] + seq_len)
+            return offset_lod
 
+        level = 0
         # compute the rank_table according to ref_lod
         ref_lod = self.data[self.data_desc[1][0]][1][level]
         rank_table = []  # list of (index, length)
-        for i in range(len(ref_lod) - 1):
-            rank_table.append((i, ref_lod[i + 1] - ref_lod[i]))
+        for i in range(len(ref_lod)):
+            rank_table.append((i, ref_lod[i]))
         rank_table = sorted(rank_table, lambda x, y: y[1] - x[1])
 
         # compute the input sequence info according to input_lod
         input_value, input_lod = self.data[self.data_desc[0][0]]
+        offset_lod = convert_to_offset(input_lod)
 
         input_table = []  # list of (offset, length, sub_lod)
-        if input_lod:
-            for i in range(len(input_lod[level]) - 1):
+        if offset_lod:
+            for i in range(len(offset_lod[level]) - 1):
                 start_idx = i
                 end_idx = i + 1
                 sub_lod = []
-                for lod_level_i in input_lod[level:]:
+                for lod_level_i in offset_lod[level:]:
                     sub_lod_i = []
                     for idx in range(start_idx, end_idx):
                         sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[
@@ -132,10 +138,9 @@ class TestReorderLoDTensor(unittest.TestCase):
 
             input_seq_sub_lod = input_table[index][2]
             if len(output_lod) == 0:
-                output_lod = [[0] for i in input_seq_sub_lod]
-            for i, sub_lod_i in enumerate(input_seq_sub_lod):
-                for idx_sub in sub_lod_i:
-                    output_lod[i].append(output_lod[i][-1] + idx_sub)
+                output_lod = [[] for i in input_seq_sub_lod]
+            for i, level in enumerate(input_seq_sub_lod):
+                output_lod[i].extend(level)
         return output_value, output_lod
 
     def test_reorder_lod_tensor(self):
@@ -148,7 +153,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod, actual_output.lod())
+            self.assertEqual(expect_output_lod,
+                             actual_output.recursive_sequence_lengths())
         # check gradient
         expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
@@ -156,7 +162,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod, actual_grad.lod())
+            self.assertEqual(expect_grad_lod,
+                             actual_grad.recursive_sequence_lengths())
 
     def test_reorder_tensor(self):
         self.data_desc[0][-1] = 0  # input is tensor
@@ -168,7 +175,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod, actual_output.lod())
+            self.assertEqual(expect_output_lod,
+                             actual_output.recursive_sequence_lengths())
         # check gradient
         expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
@@ -176,14 +184,14 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod, actual_grad.lod())
+            self.assertEqual(expect_grad_lod,
+                             actual_grad.recursive_sequence_lengths())
 
         # compare outputs between LodTensors with explicit and implicit lod
         # use the same data but set the input lod explicitly
-        input_lod = [[
-            i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1)
-        ]]
-        self.inputs[self.data_desc[0][0]].set_lod(input_lod)
+        input_lod = [[1] * len(self.data[self.data_desc[0][0]][0])]
+        self.inputs[self.data_desc[0][0]].set_recursive_sequence_lengths(
+            input_lod)
         # preserve the output of LodTensor with implicit lod to compare
         expect_output = [
             numpy.array(actual_output) for actual_output in self.actual_outputs
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index 3d754aff3a..df5684ab17 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -107,7 +107,7 @@ class TestROIPoolOp(OpTest):
         rois = []
         self.rois_lod = [[]]
         for bno in range(self.batch_size):
-            self.rois_lod[0].append(len(rois))
+            self.rois_lod[0].append(bno + 1)
             for i in range(bno + 1):
                 x1 = np.random.random_integers(
                     0, self.width / self.spatial_scale - self.pooled_width)
@@ -121,7 +121,6 @@ class TestROIPoolOp(OpTest):
 
                 roi = [bno, x1, y1, x2, y2]
                 rois.append(roi)
-        self.rois_lod[0].append(len(rois))
         self.rois_num = len(rois)
         self.rois = np.array(rois).astype("int64")
 
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
index 30f1efbcbc..07dcd10868 100644
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
@@ -19,8 +19,10 @@ from op_test import OpTest
 
 def row_conv_forward(x, lod, wt):
     out = np.zeros_like(x)
-    seq_info = lod[0]
-    num_sequences = len(seq_info) - 1
+    num_sequences = len(lod[0])
+    seq_info = [0]
+    for seq_len in lod[0]:
+        seq_info.append(seq_info[-1] + seq_len)
     context_length = wt.shape[0]
 
     for i in range(num_sequences):  # loop over number of sequences
@@ -32,7 +34,6 @@ def row_conv_forward(x, lod, wt):
         cur_timesteps = end - start
         for j in range(cur_timesteps):  # loop over different timesteps
             for k in range(context_length):
-
                 if j + k >= cur_timesteps:
                     continue
                 curoutput[j, :] += curinput[j + k, :] * wt[k, :]
@@ -44,8 +45,8 @@ class TestRowConvOp1(OpTest):
     def setUp(self):
 
         self.op_type = "row_conv"
-        lod = [[0, 2, 5, 7]]
-        T = lod[0][-1]
+        lod = [[2, 3, 2]]
+        T = sum(lod[0])
         D = 16
         context_length = 2
 
@@ -75,8 +76,8 @@ class TestRowConvOp2(OpTest):
     def setUp(self):
 
         self.op_type = "row_conv"
-        lod = [[0, 20, 50, 100]]
-        T = lod[0][-1]
+        lod = [[20, 30, 50]]
+        T = sum(lod[0])
         D = 35
         context_length = 35
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
index 10592d127f..11ffa761a6 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
@@ -18,14 +18,19 @@ import sys
 from op_test import OpTest
 
 
-def to_abs_lod(lod):
-    if len(lod) == 0 or len(lod) == 1:
-        return lod
+def to_abs_offset_lod(lod):
+    offset_lod = [[0] for i in lod]
+    for i, level in enumerate(lod):
+        for seq_len in level:
+            offset_lod[i].append(offset_lod[i][-1] + seq_len)
+
+    if len(offset_lod) == 0 or len(offset_lod) == 1:
+        return offset_lod
     import copy
-    new_lod = copy.deepcopy(lod)
-    for idx, val in enumerate(lod[0]):
-        new_lod[0][idx] = lod[1][val]
-    return new_lod
+    new_offset_lod = copy.deepcopy(offset_lod)
+    for idx, val in enumerate(offset_lod[0]):
+        new_offset_lod[0][idx] = offset_lod[1][val]
+    return new_offset_lod
 
 
 def seq_concat(inputs, level):
@@ -35,11 +40,11 @@ def seq_concat(inputs, level):
     x1 = inputs['X'][1][1][0]
     level_idx = len(lod0) - level - 1
     outs = []
-    for i in range(len(lod0[level_idx]) - 1):
-        sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][
-            i + 1], :]
-        sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][
-            i + 1], :]
+    for i in range(len(lod0[level_idx])):
+        sub_x0 = x0[to_abs_offset_lod(lod0)[level_idx][i]:to_abs_offset_lod(
+            lod0)[level_idx][i + 1], :]
+        sub_x1 = x1[to_abs_offset_lod(lod1)[level_idx][i]:to_abs_offset_lod(
+            lod1)[level_idx][i + 1], :]
         outs.append(np.concatenate((sub_x0, sub_x1), axis=0))
     return np.concatenate(outs, axis=0)
 
@@ -48,9 +53,9 @@ class TestSeqConcatOp(OpTest):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((4, 8, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod1 = [[2, 2], [1, 1, 1, 1]]
         axis = 1
         level = 1
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
@@ -72,14 +77,14 @@ class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((7, 6, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]]
+        lod1 = [[2, 2], [1, 2, 2, 2]]
         axis = 0
         level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]]
+        out_lod = [[2, 2], [2, 3, 3, 3]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
@@ -87,14 +92,14 @@ class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((7, 6, 3)).astype('float32')
-        lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]]
+        lod1 = [[3, 1], [1, 2, 2, 2]]
         axis = 0
         level = 1
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]]
+        out_lod = [[5, 3], [1, 1, 1, 2, 2, 1, 1, 2]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
@@ -102,14 +107,14 @@ class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 3, 4)).astype('float32')
-        lod0 = [[0, 1, 2, 3, 4]]
+        lod0 = [[1, 1, 1, 1]]
         x1 = np.random.random((7, 3, 4)).astype('float32')
-        lod1 = [[0, 1, 3, 5, 7]]
+        lod1 = [[1, 2, 2, 2]]
         axis = 0
         level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 2, 5, 8, 11]]
+        out_lod = [[2, 3, 3, 3]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
index 51dbf1f618..9701d9adef 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
@@ -75,35 +75,38 @@ class TestSeqProject(OpTest):
         pading_data = self.pad_data
         out = np.zeros((self.input_size[0], self.context_length *
                         self.input_size[1])).astype('float32')
-        lod = lod[0]
+        offset = [0]
+        for seq_len in lod[0]:
+            offset.append(offset[-1] + seq_len)
         begin_pad = np.max([0, -self.context_start])
 
-        for i in range(len(lod) - 1):
+        for i in range(len(offset) - 1):
             for j in range(self.context_length):
-                in_begin = lod[i] + self.context_start + j
-                in_end = lod[i + 1] + self.context_start + j
-                out_begin = lod[i]
-                out_end = lod[i + 1]
-                if in_begin < lod[i]:
-                    pad_size = np.min([lod[i] - in_begin, lod[i + 1] - lod[i]])
+                in_begin = offset[i] + self.context_start + j
+                in_end = offset[i + 1] + self.context_start + j
+                out_begin = offset[i]
+                out_end = offset[i + 1]
+                if in_begin < offset[i]:
+                    pad_size = np.min(
+                        [offset[i] - in_begin, offset[i + 1] - offset[i]])
                     if self.padding_trainable:
                         sub_w = pading_data[j:j + pad_size, :]
-                        out[lod[i]:lod[i] + pad_size, j * self.input_size[1]:(
-                            j + 1) * self.input_size[1]] = sub_w
-                    out_begin = lod[i] + pad_size
-                    in_begin = lod[i]
+                        out[offset[i]:offset[i] + pad_size, j * self.input_size[
+                            1]:(j + 1) * self.input_size[1]] = sub_w
+                    out_begin = offset[i] + pad_size
+                    in_begin = offset[i]
 
-                if in_end > lod[i + 1]:
+                if in_end > offset[i + 1]:
                     pad_size = np.min(
-                        [in_end - lod[i + 1], lod[i + 1] - lod[i]])
+                        [in_end - offset[i + 1], offset[i + 1] - offset[i]])
                     if self.padding_trainable:
                         sub_w = pading_data[begin_pad + self.context_start + j -
                                             pad_size:begin_pad +
                                             self.context_start + j, :]
-                        out[lod[i + 1] - pad_size:lod[i + 1], j * self.
+                        out[offset[i + 1] - pad_size:offset[i + 1], j * self.
                             input_size[1]:(j + 1) * self.input_size[1]] = sub_w
-                    in_end = lod[i + 1]
-                    out_end = lod[i + 1] - pad_size
+                    in_end = offset[i + 1]
+                    out_end = offset[i + 1] - pad_size
                 if in_end <= in_begin:
                     continue
 
@@ -175,7 +178,11 @@ class TestSeqProject(OpTest):
         self.context_stride = 1
 
         self.input_size = [self.input_row, 23]
-        self.lod = [[0, 4, 5, 8, self.input_row]]
+        offset_lod = [[0, 4, 5, 8, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
@@ -188,7 +195,11 @@ class TestSeqProjectCase1(TestSeqProject):
         self.context_stride = 1
 
         self.input_size = [self.input_row, 23]
-        self.lod = [[0, 4, 5, 8, self.input_row]]
+        offset_lod = [[0, 4, 5, 8, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
@@ -203,8 +214,12 @@ class TestSeqProjectCase2(TestSeqProject):
         self.input_size = [self.input_row, 23]
         idx = range(self.input_size[0])
         del idx[0]
-        self.lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
-                    [self.input_size[0]]]
+        offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
+                      [self.input_size[0]]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
index 2e48ef0e88..0b3659d7a6 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -18,26 +18,34 @@ from op_test import OpTest
 
 
 class TestSeqAvgPool(OpTest):
+    def convert_to_offset(self, lod):
+        offset = [[0] for i in lod]
+        for i, level in enumerate(lod):
+            for seq_len in level:
+                offset[i].append(offset[i][-1] + seq_len)
+        return offset
+
     def set_data(self):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
-        lod = [[0, 4, 5, 8, 11]]
+        lod = [[4, 1, 3, 3]]
         self.inputs = {'X': (x, lod)}
+        offset = self.convert_to_offset(lod)
 
         out = np.zeros((4, 23)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x.mean(axis=0)
 
     def setUp(self):
-        x, lod, out = self.set_data()
-        self.compute(x, lod, out)
+        x, offset, out = self.set_data()
+        self.compute(x, offset, out)
 
     def test_check_output(self):
         self.check_output()
@@ -50,10 +58,10 @@ class TestSeqAvgPool(OpTest):
 
 
 class TestSeqSumPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SUM"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x.sum(axis=0)
 
 
@@ -61,46 +69,47 @@ class TestSeqMaxPool(TestSeqAvgPool):
     def set_data(self):
         self.op_type = 'sequence_pool'
         x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
-        for i in range(4):
-            l = lod[0][i + 1] - lod[0][i]
-            x[lod[0][i] + np.random.randint(l), :] += 2.0
+        lod = [[4, 1, 3, 5]]
+        offset = self.convert_to_offset(lod)
+        for i in range(len(offset[0]) - 1):
+            l = offset[0][i + 1] - offset[0][i]
+            x[offset[0][i] + np.random.randint(l), :] += 2.0
 
         self.inputs = {'X': (x, lod)}
 
         out = np.zeros((4, 23)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "MAX"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = np.amax(sub_x, axis=0)
 
 
 class TestSeqSqrtPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SQRT"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            len = lod[0][i + 1] - lod[0][i]
-            out[i] = sub_x.sum(axis=0) / np.sqrt(len)
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
+            seq_len = offset[0][i + 1] - offset[0][i]
+            out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
 
 
 class TestSeqLastPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x[-1, :]
 
 
 class TestSeqFirstPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "FIRST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x[0, :]
 
 
@@ -109,35 +118,39 @@ class TestSeqAvgPool2D(TestSeqAvgPool):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
+        lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
+        offset = self.convert_to_offset(lod)
 
         out = np.zeros((4, 3, 17)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
 
 
 class TestSeqSumPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SUM"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
 
 
 class TestSeqSqrtPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SQRT"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
-            len = lod[0][i + 1] - lod[0][i]
-            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
+            seq_len = offset[0][i + 1] - offset[0][i]
+            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17))
 
     def test_check_grad(self):
         # Remove MaxIndex after check_grad is refined.
@@ -150,36 +163,40 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
     def set_data(self):
         self.op_type = 'sequence_pool'
         x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
+        lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
-        for i in range(4):
-            l = lod[0][i + 1] - lod[0][i]
-            x[lod[0][i] + np.random.randint(l), :] += 1.0
+        offset = self.convert_to_offset(lod)
+        for i in range(len(offset[0]) - 1):
+            l = offset[0][i + 1] - offset[0][i]
+            x[offset[0][i] + np.random.randint(l), :] += 1.0
 
         out = np.zeros((4, 3, 11)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "MAX"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
             out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
 class TestSeqLastPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x[-1, :], (3, 17))
 
 
 class TestSeqFirstPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "FIRST"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x[0, :], (3, 17))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
index ebab77e804..8f0765277a 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
@@ -18,15 +18,17 @@ from op_test import OpTest
 
 
 def sequence_erase(in_seq, lod0, tokens):
-    new_lod0 = [0]
+    new_lod0 = []
     out_seq = []
-    for i in range(0, len(lod0) - 1):
+    offset = 0
+    for i in range(0, len(lod0)):
         num_out = 0
-        for dat in in_seq[lod0[i]:lod0[i + 1]]:
+        for dat in in_seq[offset:(offset + lod0[i])]:
             if dat not in tokens:
                 out_seq.append(dat)
                 num_out += 1
-        new_lod0.append(new_lod0[-1] + num_out)
+        offset += lod0[i]
+        new_lod0.append(num_out)
     return np.array(out_seq).astype("int32"), new_lod0
 
 
@@ -34,7 +36,7 @@ class TestSequenceEraseOpInt32(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = [2, 3, 5]
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
@@ -49,7 +51,7 @@ class TestSequenceEraseOpInt64(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = [2, 3, 5]
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
@@ -64,7 +66,7 @@ class TestSequenceEraseOpEmpty(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = []
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 4c8ec1426c..0bbd31814e 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -21,7 +21,7 @@ class TestSequenceExpand(OpTest):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
         y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
-        y_lod = [[0, 1, 4, 8]]
+        y_lod = [[1, 3, 4]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
 
     def compute(self):
@@ -37,23 +37,27 @@ class TestSequenceExpand(OpTest):
         out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype)
 
         if x_lod is None:
-            x_idx = [i for i in xrange(x_data.shape[0] + 1)]
+            # x_idx = [i for i in xrange(x_data.shape[0] + 1)]
+            x_idx = [1] * x_data.shape[0]
         else:
             x_idx = x_lod[0]
-            out_lod = [[0]]
+            out_lod = [[]]
+
+        offset = 0
+        for i in xrange(len(y_lod[ref_level])):
+            repeat_num = y_lod[ref_level][i]
+            x_len = x_idx[i]
 
-        for i in xrange(1, len(y_lod[ref_level])):
-            repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]
-            x_len = x_idx[i] - x_idx[i - 1]
             if repeat_num > 0:
-                x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
+                x_sub = x_data[offset:(offset + x_len), :]
                 stacked_x_sub = x_sub
                 for r in range(repeat_num - 1):
                     stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
                 out = np.vstack((out, stacked_x_sub))
                 if x_lod is not None:
                     for j in xrange(repeat_num):
-                        out_lod[0].append(out_lod[0][-1] + x_len)
+                        out_lod[0].append(x_len)
+            offset += x_len
 
         if x_lod is None:
             self.outputs = {'Out': out}
@@ -75,9 +79,9 @@ class TestSequenceExpand(OpTest):
 class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[0, 2, 5]]
+        x_lod = [[2, 3]]
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
-        y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
+        y_lod = [[2, 3], [2, 2, 3, 3, 3]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
         self.attrs = {'ref_level': 0}
 
@@ -85,9 +89,9 @@ class TestSequenceExpandCase1(TestSequenceExpand):
 class TestSequenceExpandCase2(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
-        x_lod = [[0, 1]]
+        x_lod = [[1]]
         y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
-        y_lod = [[0, 2], [0, 2]]
+        y_lod = [[2], [1, 1]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
         self.attrs = {'ref_level': 0}
 
@@ -95,9 +99,9 @@ class TestSequenceExpandCase2(TestSequenceExpand):
 class TestSequenceExpandCase3(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
-        x_lod = [[0, 1, 2, 3, 4]]
-        y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
-        y_lod = [[0, 2, 4, 4, 6]]
+        x_lod = [[1, 1, 1, 1]]
+        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
+        y_lod = [[2, 2, 2, 2]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
@@ -105,9 +109,9 @@ class TestSequenceExpandCase4(TestSequenceExpand):
     def set_data(self):
         data = np.random.uniform(0.1, 1, [5 * 2, 1])
         x_data = np.array(data).reshape([5, 2]).astype('float32')
-        x_lod = [[0, 2, 5]]
-        y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
-        y_lod = [[0, 1, 3], [0, 1, 3]]
+        x_lod = [[2, 3]]
+        y_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
+        y_lod = [[2], [2, 3]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
index efeab56039..68f2e5eba3 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
@@ -22,7 +22,7 @@ class TestSequenceReshape(OpTest):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
+        x_lod = [[4, 1, 3, 3]]
         x = np.random.uniform(0.1, 1, [11, 24]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
@@ -34,13 +34,13 @@ class TestSequenceReshape(OpTest):
 
     def compute_output(self, x, x_lod, dimension):
         x_width = x.shape[1]
-        out_lod = [[0]]
-        for i in xrange(len(x_lod[0]) - 1):
-            seq_len = x_lod[0][i + 1] - x_lod[0][i]
+        out_lod = [[]]
+        for i in xrange(len(x_lod[0])):
+            seq_len = x_lod[0][i]
             offset = (seq_len * x_width) / dimension
             assert int(offset) * dimension == seq_len * x_width
-            out_lod[0].append(out_lod[0][-1] + int(offset))
-        out = np.zeros(shape=(out_lod[0][-1], dimension)).astype('float32')
+            out_lod[0].append(int(offset))
+        out = np.zeros(shape=(sum(out_lod[0]), dimension)).astype('float32')
         out.ravel()[:] = x.ravel()[:]
         return out, out_lod
 
@@ -55,7 +55,7 @@ class TestSequenceReshape_reduce(TestSequenceReshape):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 24
-        x_lod = [[0, 4, 6, 8, 12]]
+        x_lod = [[4, 2, 2, 4]]
         x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
@@ -70,7 +70,7 @@ class TestSequenceReshape_same(TestSequenceReshape):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 12
-        x_lod = [[0, 4, 6, 8, 12]]
+        x_lod = [[4, 2, 2, 4]]
         x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
index 660b4a171d..313e485d1e 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
@@ -29,20 +29,20 @@ class TestSequenceSliceOp(OpTest):
 
         self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
         outs = []  #np.zeros((100, 3, 2)).astype('float32')
-        out_lod = [[0]]
-        out_lod_offset = 0
+        out_lod = [[]]
+        lod_offset = 0
         for i in range(len(offset)):
-            sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] +
+            sub_x = x[lod_offset + offset[i, 0]:lod_offset + offset[i, 0] +
                       length[i, 0], :]
-            out_lod_offset = out_lod_offset + len(sub_x)
             outs.append(sub_x)
-            out_lod[0].append(out_lod_offset)
+            out_lod[0].append(len(sub_x))
+            lod_offset += lod[0][i]
         outs = np.concatenate(outs, axis=0)
         self.outputs = {'Out': (outs, out_lod)}
 
     def init_test_case(self):
         self.x_dim = (100, 3, 2)
-        self.x_lod = [[0, 20, 40, 60, 80, 100]]
+        self.x_lod = [[20, 20, 20, 20, 20]]
         self.offset = [[1], [2], [3], [4], [5]]
         self.length = [[10], [8], [6], [4], [2]]
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
index d6dc99bb31..e91a69a0f8 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@@ -26,15 +26,16 @@ class TestSequenceSoftmaxOp(OpTest):
         self.init_op_type()
 
         x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
-        lod = [[0, 4, 5, 8, 11]]
+        lod = [[4, 1, 3, 3]]
 
         out = np.zeros((11, 1)).astype("float32")
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i])
+        offset = 0
+        for i in range(len(lod[0])):
+            sub_x = x[offset:offset + lod[0][i], :]
+            sub_x = sub_x.reshape(1, lod[0][i])
             sub_out = stable_softmax(sub_x)
-            out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape(
-                lod[0][i + 1] - lod[0][i], 1)
+            out[offset:offset + lod[0][i], :] = sub_out.reshape(lod[0][i], 1)
+            offset += lod[0][i]
 
         self.inputs = {"X": (x, lod)}
         self.outputs = {"Out": out}
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index 1d93230e7b..b779f0fb01 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -54,12 +54,12 @@ class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase):
     def test_refer_lod(self):
         cpu = core.CPUPlace()
         x_tensor = core.LoDTensor()
-        x_tensor.set_lod([[0, 2, 5, 6]])
+        x_tensor.set_recursive_sequence_lengths([[2, 3, 1]])
         tensor_np = np.random.random(size=(6, 100)).astype('float32')
         x_tensor.set(tensor_np, cpu)
 
         rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
         rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
                               cpu)
 
@@ -83,7 +83,7 @@ class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase):
         x_tensor.set(tensor_np, cpu)
 
         rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
         rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
                               cpu)
 
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index 02cc7da849..0916ed7c9f 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -56,7 +56,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
     def test_split_and_merge_lod_tensor_level_0(self):
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         mask_np = np.array([0, 1, 0]).astype('bool')
         mask_np = np.expand_dims(mask_np, axis=1)
@@ -68,15 +68,15 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
         expect_true = core.LoDTensor()
         expect_true.set(expect_true_tensor, self.place())
-        expect_true.set_lod([[0, 6]])
+        expect_true.set_recursive_sequence_lengths([[6]])
 
         expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
         expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
-        expect_false_lod = [[0, 3, 4]]
+        expect_false_lod = [[3, 1]]
 
         expect_false = core.LoDTensor()
         expect_false.set(expect_false_tensor, self.place())
-        expect_false.set_lod(expect_false_lod)
+        expect_false.set_recursive_sequence_lengths(expect_false_lod)
 
         self.main(
             tensor=tensor,
@@ -126,7 +126,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
 
     def check_tensor_same(self, actual, expect):
         self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
-        self.assertEqual(actual.lod(), expect.lod())
+        self.assertEqual(actual.recursive_sequence_lengths(),
+                         expect.recursive_sequence_lengths())
 
 
 class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
@@ -151,7 +152,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         mask_np = np.array([0, 1, 0]).astype('bool')
         mask_np = np.expand_dims(mask_np, axis=1)
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
index ccb41e56c5..bd20889752 100644
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
@@ -22,22 +22,23 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
     if len(gt_lod) != len(neg_lod):
         raise AssertionError("The input arguments are illegal.")
 
-    batch_size = len(gt_lod) - 1
+    batch_size = len(gt_lod)
 
     match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
-    neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32')
+    neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32')
 
+    offset = 0
     for n in range(batch_size):
-        gt_num = gt_lod[n + 1] - gt_lod[n]
+        gt_num = gt_lod[n]
         ids = random.sample([i for i in range(num_prior)], gt_num)
         match_indices[n, ids] = [i for i in range(gt_num)]
 
         ret_ids = set([i for i in range(num_prior)]) - set(ids)
-        s = neg_lod[n]
-        e = neg_lod[n + 1]
-        l = e - s
+        l = neg_lod[n]
         neg_ids = random.sample(ret_ids, l)
-        neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1)
+        neg_indices[offset:offset + neg_lod[n], :] = np.array(neg_ids).astype(
+            'int32').reshape(l, 1)
+        offset += neg_lod[n]
 
     return match_indices, neg_indices
 
@@ -56,24 +57,28 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
     # init weight for target label
     trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
 
+    gt_offset = 0
+    neg_offset = 0
     for i in range(batch_size):
         cur_indices = match_indices[i]
         col_ids = np.where(cur_indices > -1)
         col_val = cur_indices[col_ids]
 
-        gt_start = gt_lod[i]
         # target bbox
-        for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
+        for v, c in zip(col_val + gt_offset, col_ids[0].tolist()):
             trg_box[i][c][:] = encoded_box[v][c][:]
         # weight for target bbox
         trg_box_wt[i][col_ids] = 1.0
 
-        trg_label[i][col_ids] = gt_label[col_val + gt_start]
+        trg_label[i][col_ids] = gt_label[col_val + gt_offset]
         trg_label_wt[i][col_ids] = 1.0
         # set target label weight to 1.0 for the negative samples
         if neg_indices is not None:
-            neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+            neg_ids = neg_indices[neg_offset:neg_offset + neg_lod[i]]
             trg_label_wt[i][neg_ids] = 1.0
+        # update offset
+        gt_offset += gt_lod[i]
+        neg_offset += neg_lod[i]
 
     return trg_box, trg_box_wt, trg_label, trg_label_wt
 
@@ -83,11 +88,11 @@ class TestTargetAssginFloatType(OpTest):
         self.op_type = "target_assign"
         num_prior = 120
         num_class = 21
-        gt_lod = [0, 5, 11, 23]
-        neg_lod = [0, 4, 7, 13]
+        gt_lod = [5, 6, 12]
+        neg_lod = [4, 3, 6]
         mismatch_value = 0
-        batch_size = len(gt_lod) - 1
-        num_gt = gt_lod[-1]
+        batch_size = len(gt_lod)
+        num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
         gt_label = np.random.randint(
@@ -121,11 +126,11 @@ class TestTargetAssginIntType(OpTest):
         self.op_type = "target_assign"
         num_prior = 120
         num_class = 21
-        gt_lod = [0, 5, 11, 23]
-        neg_lod = [0, 4, 7, 13]
+        gt_lod = [5, 6, 12]
+        neg_lod = [4, 3, 6]
         mismatch_value = 0
-        batch_size = len(gt_lod) - 1
-        num_gt = gt_lod[-1]
+        batch_size = len(gt_lod)
+        num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
         gt_label = np.random.randint(
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index 379081c328..f17edd3025 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -69,15 +69,14 @@ class TestTensor(unittest.TestCase):
         array[0, 0, 0] = 3
         array[3, 3, 5] = 10
         lod_tensor.set(array, place)
-        lod_tensor.set_lod([[0, 2, 4]])
+        lod_tensor.set_recursive_sequence_lengths([[2, 2]])
 
         lod_v = numpy.array(lod_tensor)
         self.assertTrue(numpy.alltrue(array == lod_v))
 
-        lod = lod_tensor.lod()
-        self.assertEqual(0, lod[0][0])
+        lod = lod_tensor.recursive_sequence_lengths()
+        self.assertEqual(2, lod[0][0])
         self.assertEqual(2, lod[0][1])
-        self.assertEqual(4, lod[0][2])
 
     def test_float_lod_tensor(self):
         place = core.CPUPlace()
@@ -97,21 +96,21 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertEqual(len(lod_tensor.lod()), 0)
+        self.assertEqual(len(lod_tensor.recursive_sequence_lengths()), 0)
 
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        lod_tensor.set_lod(lod_py)
-        lod = lod_tensor.lod()
+        lod_py = [[2, 1], [1, 2, 2]]
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
+        lod = lod_tensor.recursive_sequence_lengths()
         self.assertListEqual(lod_py, lod)
 
     def test_lod_tensor_init(self):
         scope = core.Scope()
         place = core.CPUPlace()
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
-        lod_tensor.set_lod(lod_py)
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
@@ -121,17 +120,17 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.lod())
+        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
 
     def test_lod_tensor_gpu_init(self):
         if not core.is_compiled_with_cuda():
             return
         place = core.CUDAPlace(0)
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
-        lod_tensor.set_lod(lod_py)
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
@@ -141,7 +140,7 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.lod())
+        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
 
     def test_empty_tensor(self):
         place = core.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index ac638f7836..9f1aaee472 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -34,8 +34,8 @@ class CTCForward(object):
 
         self.level = 0
         self.num_classes = softmax.shape[1]
-        self.batch_size = len(softmax_lod[self.level]) - 1
-        assert self.batch_size == len(labels_lod[self.level]) - 1
+        self.batch_size = len(softmax_lod[self.level])
+        assert self.batch_size == len(labels_lod[self.level])
 
         self.loss = np.zeros([self.batch_size, 1], dtype="float32")
         self.gradient = np.zeros(self.softmax.shape, dtype="float32")
@@ -156,16 +156,20 @@ class CTCForward(object):
         return -log_prob
 
     def forward(self):
+        softmax_offset = 0
+        labels_offset = 0
         for i in range(self.batch_size):
-            softmax_start_i = self.softmax_lod[self.level][i]
-            softmax_end_i = self.softmax_lod[self.level][i + 1]
-            labels_start_i = self.labels_lod[self.level][i]
-            labels_end_i = self.labels_lod[self.level][i + 1]
+            softmax_start_i = softmax_offset
+            softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
+            labels_start_i = labels_offset
+            labels_end_i = labels_offset + self.labels_lod[self.level][i]
 
             softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
             labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
             self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
                                                    labels_a_sequence)
+            softmax_offset += self.softmax_lod[self.level][i]
+            labels_offset += self.labels_lod[self.level][i]
         return self.loss
 
 
@@ -173,8 +177,8 @@ class TestWarpCTCOp(OpTest):
     def config(self):
         self.batch_size = 4
         self.num_classes = 8
-        self.logits_lod = [[0, 4, 5, 8, 11]]
-        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.logits_lod = [[4, 1, 3, 3]]
+        self.labels_lod = [[3, 1, 4, 4]]
         self.blank = self.num_classes - 1
         self.norm_by_times = False
 
@@ -184,11 +188,13 @@ class TestWarpCTCOp(OpTest):
 
         logits = np.random.uniform(
             0.1, 1.0,
-            [self.logits_lod[0][-1], self.num_classes]).astype("float32")
+            [sum(self.logits_lod[0]), self.num_classes]).astype("float32")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         # labels should not be blank
         labels = np.random.randint(
-            0, self.num_classes - 1, [self.labels_lod[0][-1], 1], dtype="int32")
+            0,
+            self.num_classes - 1, [sum(self.labels_lod[0]), 1],
+            dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
                          self.blank, self.norm_by_times)
@@ -196,9 +202,8 @@ class TestWarpCTCOp(OpTest):
 
         max_sequence_length = 0
         for i in range(self.batch_size):
-            max_sequence_length = max(
-                max_sequence_length,
-                self.logits_lod[0][i + 1] - self.logits_lod[0][i])
+            max_sequence_length = max(max_sequence_length,
+                                      self.logits_lod[0][i])
         self.gradient = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
             dtype="float32")
@@ -222,8 +227,8 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
     def config(self):
         self.batch_size = 4
         self.num_classes = CUDA_BLOCK_SIZE + 2
-        self.logits_lod = [[0, 4, 5, 8, 11]]
-        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.logits_lod = [[4, 1, 3, 3]]
+        self.labels_lod = [[3, 1, 4, 4]]
         self.blank = 0
         self.norm_by_times = False
 
diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
index 2adf917bc5..436f9b9f86 100644
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
@@ -76,11 +76,11 @@ class TestWeightNormalization(unittest.TestCase):
                 lod_level_i = numpy.random.randint(
                     low=1,
                     high=5,
-                    size=self.batch_size if i == 0 else lod_level_i[-1])
-                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                    size=self.batch_size
+                    if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[data_lod[-1][-1] if data_lod else self.batch_size
+                size=[sum(data_lod[-1]) if data_lod else self.batch_size
                       ] + data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
@@ -90,7 +90,7 @@ class TestWeightNormalization(unittest.TestCase):
             tensor = fluid.Tensor()
             tensor.set(self.data[desc[0]][0], place)
             if self.data[desc[0]][1]:
-                tensor.set_lod(self.data[desc[0]][1])
+                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
             self.inputs[desc[0]] = tensor
 
     def weight_normalize(self):
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index 1dc94a80c9..a995ee10f2 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -22,7 +22,7 @@ def as_lodtensor(np_array, lod, place):
     tensor = core.LoDTensor()
     tensor.set(np_value, place)
     if lod is not None:
-        tensor.set_lod(lod)
+        tensor.set_recursive_sequence_lengths(lod)
     return tensor
 
 
@@ -73,7 +73,7 @@ def set_input(scope, op, inputs, place):
         if isinstance(var, tuple) or isinstance(var, np.ndarray):
             tensor = scope.find_var(var_name).get_tensor()
             if isinstance(var, tuple):
-                tensor.set_lod(var[1])
+                tensor.set_recursive_sequence_lengths(var[1])
                 var = var[0]
             tensor.set_dims(var.shape)
             tensor.set(var, place)
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index b194af76dc..a9775e10ef 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -7,7 +7,7 @@ for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
     if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/cuda/.*|paddle/function/.*|paddle/gserver/.*|paddle/math/.*|paddle/optimizer/.*|paddle/parameter/.*|paddle/pserver/.*|paddle/trainer/.*|paddle/utils/.*) ]]; then
         continue;
     else
-        cpplint $file;
+        cpplint --filter=-readability/fn_size $file;
         TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
     fi
 done

From 3a25f21eae9d175d34e3a871f1155049e615c058 Mon Sep 17 00:00:00 2001
From: Kexin Zhao <kexin.zhao.paddle@gmail.com>
Date: Fri, 15 Jun 2018 10:48:55 -0700
Subject: [PATCH 69/69] Modify lod tensor doc based on new LoDTensor Python API
 (#11253)

* Modify lod_tensor.md and nn.py

* Modify control_flow.py doc

* undo change in lod_tensor.md
---
 python/paddle/fluid/layers/control_flow.py |  4 +-
 python/paddle/fluid/layers/nn.py           | 52 +++++++++++-----------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index d55a1a6f6a..8fec2f9c12 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -746,8 +746,8 @@ def lod_rank_table(x, level=0):
         .. code-block:: text
 
             x is a LoDTensor:
-                x.lod = [[0,                2, 3],
-                         [0,             5, 6, 7]]
+                x.lod = [[2,                1],
+                         [5,             1, 1]]
                 x.data = [a, b, c, d, e, f, g]
 
             1. set level to 0:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 260d4afc9f..7377f7dd7d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1621,13 +1621,13 @@ def sequence_pool(input, pool_type):
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
 
        for different pool_type:
          average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
@@ -1686,13 +1686,13 @@ def sequence_first_step(input):
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
          out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
 
     Args:
@@ -1719,13 +1719,13 @@ def sequence_last_step(input):
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
          out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
 
     Args:
@@ -2468,18 +2468,18 @@ def sequence_expand(x, y, ref_level=-1, name=None):
 
         * Case 1
             x is a LoDTensor:
-                x.lod  = [[0,   2,        4]]
+                x.lod  = [[2,        2]]
                 x.data = [[a], [b], [c], [d]]
                 x.dims = [4, 1]
 
             y is a LoDTensor:
-                y.lod = [[0,    2,    4],
-                         [0, 3, 6, 7, 8]]
+                y.lod = [[2,    2],
+                         [3, 3, 1, 1]]
 
             ref_level: 0
 
             then output is a 1-level LoDTensor:
-                out.lod =  [[0,   2,        4,        6,        8]]
+                out.lod =  [[2,        2,        2,        2]]
                 out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
                 out.dims = [8, 1]
 
@@ -2489,7 +2489,7 @@ def sequence_expand(x, y, ref_level=-1, name=None):
                 x.dims = [3, 1]
 
             y is a LoDTensor:
-                y.lod = [[0, 2, 2, 5]]
+                y.lod = [[2, 0, 3]]
 
             ref_level: -1
 
@@ -3343,7 +3343,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.2, 0.2, 0.1, 0.5],
                       [0.5, 0.1, 0.3, 0.1]]
 
-        input.lod = [[0, 4, 8]]
+        input.lod = [[4, 4]]
 
         Then:
 
@@ -3351,7 +3351,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                        [1],
                        [3]]
 
-        output.lod = [[0, 2, 3]]
+        output.lod = [[2, 1]]
 
     Args:
 
@@ -3368,7 +3368,7 @@ def ctc_greedy_decoder(input, blank, name=None):
 
     Returns:
         Variable: CTC greedy decode result. If all the sequences in result were
-        empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].
+        empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
@@ -3458,7 +3458,7 @@ def sequence_reshape(input, new_dim):
     .. code-block:: text
 
         x is a LoDTensor:
-            x.lod  = [[0, 2, 6]]
+            x.lod  = [[2, 4]]
             x.data = [[1, 2], [3, 4],
                       [5, 6], [7, 8], [9, 10], [11, 12]]
             x.dims = [6, 2]
@@ -3466,7 +3466,7 @@ def sequence_reshape(input, new_dim):
         set new_dim = 4
 
         then out is a LoDTensor:
-            out.lod  = [[0, 1, 3]]
+            out.lod  = [[1, 2]]
             out.data = [[1, 2, 3, 4],
                         [5, 6, 7, 8], [9, 10, 11, 12]]
             out.dims = [3, 4]
@@ -3737,7 +3737,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
 
             output.dims = {8, 9}
 
-            output.lod = [[0, 4, 8]]
+            output.lod = [[4, 4]]
 
         The simple usage is:
 
@@ -4133,47 +4133,47 @@ def lod_reset(x, y=None, target_lod=None):
         * Example 1:
 
             Given a 1-level LoDTensor x:
-                x.lod =  [[ 0,     2,                   5      6 ]]
+                x.lod =  [[ 2,           3,                   1 ]]
                 x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 x.dims = [6, 1]
 
-            target_lod: [0, 4, 6]
+            target_lod: [4, 2]
 
             then we get a 1-level LoDTensor:
-                out.lod =  [[ 0,                   4,            6 ]]
+                out.lod =  [[4,                          2]]
                 out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 out.dims = [6, 1]
 
         * Example 2:
 
             Given a 1-level LoDTensor x:
-                x.lod =  [[ 0,     2,                   5      6 ]]
+                x.lod =  [[2,            3,                   1]]
                 x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 x.dims = [6, 1]
 
             y is a Tensor:
-                y.data = [[0, 2, 6]]
+                y.data = [[2, 4]]
                 y.dims = [1, 3]
 
             then we get a 1-level LoDTensor:
-                out.lod =  [[ 0,     2,                          6 ]]
+                out.lod =  [[2,            4]]
                 out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 out.dims = [6, 1]
 
         * Example 3:
 
             Given a 1-level LoDTensor x:
-                x.lod =  [[ 0,      2,                   5     6 ]]
+                x.lod =  [[2,            3,                   1]]
                 x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 x.dims = [6, 1]
 
             y is a 2-level LoDTensor:
-                y.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+                y.lod =  [[2, 2], [2, 2, 1, 1]]
                 y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
                 y.dims = [6, 1]
 
             then we get a 2-level LoDTensor:
-                out.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+                out.lod =  [[2, 2], [2, 2, 1, 1]]
                 out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 out.dims = [6, 1]