From 584c3f048fcd221be5095575f50f837793f946c0 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Sat, 29 Sep 2018 08:01:02 +0000 Subject: [PATCH 01/48] fix sparse rmsprop --- paddle/fluid/operators/adam_op.h | 19 +- paddle/fluid/operators/math/algorithm.h | 44 ++++ paddle/fluid/operators/rmsprop_op.h | 270 ++++++++++++++++++++---- 3 files changed, 276 insertions(+), 57 deletions(-) create mode 100644 paddle/fluid/operators/math/algorithm.h diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 4cb1f3a80e..8d664e3e9a 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/for_range.h" @@ -199,23 +200,9 @@ struct SparseAdamFunctor { row_numel_(row_numel), row_count_(row_count) {} - inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const { - int64_t beg = 0, end = row_count_ - 1; - while (beg <= end) { - auto mid = ((beg + end) >> 1); - if (rows_[mid] == row) - return mid; - else if (rows_[mid] < row) - beg = mid + 1; - else - end = mid - 1; - } - return -1; - } - inline HOSTDEVICE void operator()(size_t i) const { - int64_t row = i / row_numel_; - auto row_idx = BinarySearchInRows(row); + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; // The following code is the same as dense diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h new file mode 100644 index 0000000000..262469beea --- /dev/null +++ b/paddle/fluid/operators/math/algorithm.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // for int64_t +#include + +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { + int64_t beg = 0, end = num - 1; + while (beg <= end) { + auto mid = ((beg + end) >> 1); + if (x[mid] == val) + return mid; + else if (x[mid] < val) + beg = mid + 1; + else + end = mid - 1; + } + return -1; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index 25ed32c5eb..406730407d 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -13,66 +13,254 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/algorithm.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; template using EigenVector = framework::EigenVector; +template +struct DenseRmspropGradFunctor { + inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; } + + const T *grad_; +}; + +template +struct SparseRmspropGradFunctor { + inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows, + int64_t row_numel, int64_t row_count) + : grad_(grad), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { + auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_); + return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0; + } + + const T *grad_; + const int64_t *rows_; + int64_t row_numel_; + int64_t row_count_; +}; + +template +struct UncenteredRmspropFunctor { + UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho, + T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + } + + T *param_; + T *ms_; + T *mom_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + +template +struct CenteredRmspropFunctor { + CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr, + T rho, T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + mean_grad_(mean_grad), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g; + T mom_out = momentum_ * mom_[idx] + + lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + mean_grad_[idx] = mg_out; + } + + T *param_; + T *ms_; + T *mom_; + T *mean_grad_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + template class RmspropOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = ctx.Output("MomentOut"); - auto* mean_square_out = ctx.Output("MeanSquareOut"); + void Compute(const framework::ExecutionContext &ctx) const override { + using Tensor = framework::LoDTensor; + auto *grad_var = ctx.InputVar("Grad"); + auto *param_out = ctx.Output("ParamOut"); + auto *moment_out = ctx.Output("MomentOut"); + auto *mean_square_out = ctx.Output("MeanSquareOut"); - auto grad = ctx.Input("Grad"); + auto epsilon = static_cast(ctx.Attr("epsilon")); + auto rho = static_cast(ctx.Attr("decay")); + auto momentum = 
static_cast(ctx.Attr("momentum")); + bool centered = ctx.Attr("centered"); - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); + auto &p_tensor = *ctx.Input("Param"); + auto &ms_tensor = *ctx.Input("MeanSquare"); + auto &lr_tensor = *ctx.Input("LearningRate"); + auto &mom_tensor = *ctx.Input("Moment"); - float epsilon = ctx.Attr("epsilon"); - float rho = ctx.Attr("decay"); - float momentum = ctx.Attr("momentum"); - bool centered = ctx.Attr("centered"); + PADDLE_ENFORCE_EQ(&p_tensor, param_out, + "Param and ParamOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, + "Moment and MomentOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, + "MeanSquare and MeanSquareOut must be the same Tensor"); + + auto &dev_ctx = ctx.template device_context(); + size_t limit = static_cast(ms_tensor.numel()); + + if (grad_var->IsType()) { + auto &grad_tensor = grad_var->Get(); + + if (std::is_same::value) { + auto &place = + *ctx.template device_context().eigen_device(); + auto lr_value = lr_tensor.data()[0]; + + auto p = EigenVector::Flatten(p_tensor); + auto ms = EigenVector::Flatten(ms_tensor); + auto g = EigenVector::Flatten(grad_tensor); + auto mom = EigenVector::Flatten(mom_tensor); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto mg = EigenVector::Flatten(mg_tensor); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = + momentum * mom + + lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + lr_value * g / (ms_out + epsilon).sqrt(); + } + p_out.device(place) = p - mom_out; + } else { + DenseRmspropGradFunctor grad_func(grad_tensor.data()); + platform::ForRange for_range(dev_ctx, limit); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), + lr_tensor.data(), rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } + } + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + auto *merged_grad = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + + math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, merged_grad); + + platform::ForRange for_range(dev_ctx, limit); + const int64_t *rows; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = merged_grad->rows().CUDAData(ctx.GetPlace()); + } else { +#endif + rows = merged_grad->rows().data(); +#ifdef PADDLE_WITH_CUDA + } +#endif + auto &merged_tensor = 
merged_grad->value(); + int64_t row_count = merged_grad->rows().size(); + int64_t row_numel = merged_tensor.numel() / row_count; + SparseRmspropGradFunctor grad_func(merged_tensor.data(), rows, + row_numel, row_count); - auto p = EigenVector::Flatten(*ctx.Input("Param")); - auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); - auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); - auto g = EigenVector::Flatten(*grad); - auto mom = EigenVector::Flatten(*ctx.Input("Moment")); - - auto p_out = EigenVector::Flatten(*param_out); - auto mom_out = EigenVector::Flatten(*moment_out); - auto ms_out = EigenVector::Flatten(*mean_square_out); - auto& place = *ctx.template device_context().eigen_device(); - - Eigen::DSizes grad_dsize(static_cast(grad->numel())); - - ms_out.device(place) = rho * ms + (1 - rho) * g * g; - if (centered) { - auto mg = EigenVector::Flatten(*ctx.Input("MeanGrad")); - auto* mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - auto mg_out = EigenVector::Flatten(*mean_grad_out); - - mg_out.device(place) = rho * mg + (1 - rho) * g; - mom_out.device(place) = momentum * mom + - lr.broadcast(grad_dsize) * g / - (ms_out - mg_out.square() + epsilon).sqrt(); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } } else { - mom_out.device(place) = - momentum * mom + - lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); } - p_out.device(place) = p - mom_out; } }; From 63b2e98f3d8783624016f56e0000eabf3dbcd02f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 10 Oct 2018 13:22:43 +0800 Subject: [PATCH 02/48] Explain LoD and a few other concepts test=develop --- paddle/fluid/pybind/pybind.cc | 45 +++++++++++++++++++++++++++- python/paddle/fluid/layers/io.py | 6 +++- python/paddle/fluid/layers/tensor.py | 2 +- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 311cd94460..a91894ba89 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -157,7 +157,50 @@ PYBIND11_PLUGIN(core) { .def("_get_double_element", TensorGetElement) .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); - py::class_(m, "LoDTensor") + py::class_(m, "LoDTensor", R"DOC( + LoDTensor is a Tensor with optional LoD information. + + np.array(lod_tensor) can convert LoDTensor to numpy array. + lod_tensor.lod() can retrieve the LoD information. + + LoD is short for Level of Details and is usually used for varied sequence + length. You can skip the following comment if you don't need optional LoD. + + For example: + A LoDTensor X can look like the example below. It contains 2 sequences. + The first has length 2 and the second has length 3, as described by x.lod. 
+ + The first tensor dimension 6=2+3 is calculated from LoD if it's available.
+ It means the total number of sequence elements. In X, each element has 2
+ columns, hence [6, 2].
+
+ x.lod = [[2, 3]]
+ x.data = [[1, 2], [3, 4],
+ [5, 6], [7, 8], [9, 10], [11, 12]]
+ x.shape = [6, 2]
+
+ LoD can have multiple levels (for example, a paragraph can have multiple
+ sentences and a sentence can have multiple words). In the following
+ LoDTensor Y, the lod_level is 2. It means there are 2 sequences; the
+ first sequence's length is 2 (it has 2 sub-sequences), and the second one's
+ length is 1. The first sequence's 2 sub-sequences have length 2 and 2,
+ respectively, and the second sequence's 1 sub-sequence has length 3.
+
+ y.lod = [[2 1], [2 2 3]]
+ y.shape = [2+2+3, ...]
+
+ Note:
+ In the above description, LoD is length-based. In Paddle's internal
+ implementation, lod is offset-based. Hence, internally,
+ y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (the length-based
+ equivalent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]).
+
+ Sometimes LoD is called recursive_sequence_length to be more
+ self-explanatory. In this case, it must be length-based. Due to historical
+ reasons, when LoD is called lod in the public API, it might be offset-based.
+ Users should be careful about it.
+
+ )DOC") .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) .def("__init__", diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 25fde782b7..a06cd4982f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -56,7 +56,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape.
- append_batch_size(bool): Whether or not to append the data as a batch.
+ append_batch_size(bool):
+ 1. If true, it prepends -1 to the shape.
+ For example, if shape=[1], the resulting shape is [-1, 1].
+ 2. If shape contains -1, such as shape=[1, -1],
+ append_batch_size will be enforced to be False (ineffective).
 dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 44b92af7ac..9c6a2112a6 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -100,7 +100,7 @@ def create_global_var(shape, force_cpu=False, name=None): """
- Create a new variable in the global block(block 0).
+ Create a new tensor variable with the given value in the global block (block 0).
 Args: shape(list[int]): shape of the variable From 8ec748cfa0505a40287ceaf579fd34d521e516ba Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 11 Oct 2018 18:35:26 +0800 Subject: [PATCH 03/48] Accelerate SelectedRows Functors: 1. Accelerate SelectedRows MergeAdd functor 2.
Add SelectedRowsSumTo functor to support MergeAdd multiple SelectedRows into one test=develop --- paddle/fluid/operators/math/CMakeLists.txt | 6 +- .../operators/math/selected_rows_functor.cc | 57 +++++- .../operators/math/selected_rows_functor.h | 116 ++++++++++++ .../math/selected_rows_functor_test.cc | 171 ++++++++++++++++++ 4 files changed, 342 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 9110135643..b0276f4080 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -3,8 +3,8 @@ add_subdirectory(detail) endif(NOT WIN32) function(math_library TARGET) - # math_library is a function to create math library. - # The interface is the same as cc_library. + # math_library is a function to create math library. + # The interface is the same as cc_library. # But it handle split GPU/CPU code and link some common library. set(cc_srcs) set(cu_srcs) @@ -53,7 +53,7 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) math_library(maxouting) math_library(pooling) -math_library(selected_rows_functor DEPS selected_rows math_function) +math_library(selected_rows_functor DEPS selected_rows math_function blas) math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 8e8baf49b2..43d593710c 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { @@ -150,6 +149,46 @@ template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; template struct SelectedRowsAddTo; +template +struct SelectedRowsSumTo { + void operator()(const platform::CPUDeviceContext& context, + const std::vector& input1, + const std::vector& input2_offsets, + framework::SelectedRows* input2) { + // Ensure all selected rows have the same height + size_t size = 0u; + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + auto& in_rows = (*iter)->rows(); + size += in_rows.end() - in_rows.begin(); + auto in1_height = (*iter)->height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + } + // concat rows + std::vector in2_rows; + in2_rows.reserve(in2_rows.size() + size); + for (auto iter = input1.begin(); iter != input1.end(); ++iter) { + const framework::Vector& in_rows = (*iter)->rows(); + in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end()); + } + input2->set_rows(in2_rows); + + // start = std::chrono::system_clock::now(); + auto* in2_value = input2->mutable_value(); + auto* in2_data = in2_value->data(); + auto blas = math::GetBlas(context); + size_t offset = 0u; + for (size_t i = 0u; i != input1.size(); ++i) { + auto& in_value = input1[i]->value(); + const auto* in_data = in_value.data(); + offset += input2_offsets[i]; + blas.VCOPY(in_value.numel(), in_data, in2_data + offset); + } + } +}; + +template struct SelectedRowsSumTo; +template struct SelectedRowsSumTo; + template struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, @@ -208,8 +247,18 @@ struct MergeAdd { framework::SelectedRows* output) { framework::SelectedRows& 
out = *output; auto input_rows = input.rows(); - std::set row_set(input_rows.begin(), input_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); + std::vector merge_rows; + merge_rows.reserve(input_rows.size()); + std::unordered_map rows_pos_map; + rows_pos_map.reserve(input_rows.size()); + size_t idx = 0u; + for (std::vector::iterator iter = input_rows.begin(); + iter != input_rows.end(); ++iter) { + if (rows_pos_map.find(*iter) == rows_pos_map.end()) { + rows_pos_map[*iter] = idx++; + merge_rows.emplace_back(*iter); + } + } auto input_width = input.value().dims()[1]; out.set_rows(merge_rows); @@ -234,8 +283,6 @@ struct MergeAdd { } }; -template struct MergeAdd; -template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index aa419f74fc..3d99c9b3f2 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -12,8 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #define INLINE_FOR2(sizei, sizej) \ @@ -49,6 +54,15 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2); }; +// input2 = [all input in input1] + input2 +template +struct SelectedRowsSumTo { + void operator()(const DeviceContext& context, + const std::vector& input1, + const std::vector& input2_offsets, + framework::SelectedRows* input2); +}; + // input2 = input1 + input2 template struct SelectedRowsAddToTensor { @@ -70,6 +84,108 @@ struct MergeAdd { framework::SelectedRows* output); }; +template <> +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + (*this)(context, input, &out); + return out; + } + + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output) { + framework::SelectedRows& out = *output; + auto input_rows = input.rows(); + std::vector merge_rows; + merge_rows.reserve(input_rows.size()); + std::unordered_map rows_pos_map; + rows_pos_map.reserve(input_rows.size()); + size_t idx = 0u; + for (std::vector::iterator iter = input_rows.begin(); + iter != input_rows.end(); ++iter) { + if (rows_pos_map.find(*iter) == rows_pos_map.end()) { + rows_pos_map[*iter] = idx++; + merge_rows.emplace_back(*iter); + } + } + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + auto blas = GetBlas(context); + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_pos_map[input_rows[i]]; + float* y = out_data + out_i * input_width; + const float* x = input_data + i * input_width; + 
blas.AXPY(input_width, 1., x, y); + } + } +}; + +template <> +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + (*this)(context, input, &out); + return out; + } + + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output) { + framework::SelectedRows& out = *output; + auto input_rows = input.rows(); + std::vector merge_rows; + merge_rows.reserve(input_rows.size()); + std::unordered_map rows_pos_map; + rows_pos_map.reserve(input_rows.size()); + size_t idx = 0u; + for (std::vector::iterator iter = input_rows.begin(); + iter != input_rows.end(); ++iter) { + if (rows_pos_map.find(*iter) == rows_pos_map.end()) { + rows_pos_map[*iter] = idx++; + merge_rows.emplace_back(*iter); + } + } + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + auto blas = GetBlas(context); + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_pos_map[input_rows[i]]; + double* y = out_data + out_i * input_width; + const double* x = input_data + i * input_width; + blas.AXPY(input_width, 1., x, y); + } + } +}; + template struct Add { framework::SelectedRows operator()(const DeviceContext& context, diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index 70bed820ee..8355893560 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) { // row9: 2.0 + 3.0 EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); } + +TEST(selected_rows_functor, cpu_merge_add_float) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows{0, 4, 4, 7}; + std::unique_ptr selected_rows{ + new paddle::framework::SelectedRows(rows, height)}; + auto* in_value = selected_rows->mutable_value(); + in_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows.size()), row_numel}), + cpu_place); + functor(ctx, in_value, 1.0); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + merge_add_functor(ctx, *selected_rows, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + + auto* out_data = output->value().data(); + + EXPECT_EQ(out_data[0 * row_numel], 1.0); + EXPECT_EQ(out_data[1 * row_numel], 2.0); + EXPECT_EQ(out_data[2 * row_numel], 1.0); +} + +TEST(selected_rows_functor, cpu_merge_add_int) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows{0, 4, 4, 7}; + std::unique_ptr 
selected_rows{ + new paddle::framework::SelectedRows(rows, height)}; + auto* in_value = selected_rows->mutable_value(); + in_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows.size()), row_numel}), + cpu_place); + functor(ctx, in_value, 1); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + merge_add_functor(ctx, *selected_rows, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + + auto* out_data = output->value().data(); + + EXPECT_EQ(out_data[0 * row_numel], 1); + EXPECT_EQ(out_data[1 * row_numel], 2); + EXPECT_EQ(out_data[2 * row_numel], 1); +} +TEST(selected_rows_functor, cpu_sum_to) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + cpu_place); + functor(ctx, in1_value, 1.0); + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + cpu_place); + functor(ctx, in2_value, 2.0); + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + output->set_height(height); + auto* out_value = output->mutable_value(); + // simplely concat two SelectedRows + out_value->mutable_data(paddle::framework::make_ddim({7, 10}), + cpu_place); + paddle::operators::math::SelectedRowsSumTo + sum_to_functor; + sum_to_functor(ctx, std::vector( + {selected_rows1.get(), selected_rows2.get()}), + std::vector({0, in1_value->numel()}), output.get()); + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + auto& out_rows = output->rows(); + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + std::unique_ptr tensor1{ + new paddle::framework::Tensor()}; + tensor1->mutable_data( + paddle::framework::make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + paddle::operators::math::SelectedRowsAddToTensor< + paddle::platform::CPUDeviceContext, float> + add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + auto* tensor1_data = tensor1->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_data[1 * row_numel + 1], 
3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); +} From 0385b0a1ea8628d2a5f4e27d86f5f0c8aed57a56 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 11 Oct 2018 19:55:27 +0800 Subject: [PATCH 04/48] Accelerate SequencePool Op on SUM mode test=develop --- paddle/fluid/operators/math/CMakeLists.txt | 4 ++-- .../fluid/operators/math/sequence_pooling.cc | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 9110135643..5878c733c4 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -3,8 +3,8 @@ add_subdirectory(detail) endif(NOT WIN32) function(math_library TARGET) - # math_library is a function to create math library. - # The interface is the same as cc_library. + # math_library is a function to create math library. + # The interface is the same as cc_library. # But it handle split GPU/CPU code and link some common library. set(cc_srcs) set(cu_srcs) diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 69318a6598..235b5405fb 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/math/sequence_pooling.h" #include + +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" namespace paddle { namespace operators { @@ -180,6 +182,7 @@ class SequencePoolFunctor { } auto lod = input.lod()[0]; auto& place = *context.eigen_device(); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { Tensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -191,7 +194,14 @@ class SequencePoolFunctor { if (pooltype == "AVERAGE") { out_e.device(place) = in_e.mean(Eigen::array({{0}})); } else if (pooltype == "SUM") { - out_e.device(place) = in_e.sum(Eigen::array({{0}})); + if (h > 0) { + const T* in_data = in_t.data(); + T* out_data = out_t.mutable_data(context.GetPlace()); + blas.VCOPY(w, in_data, out_data); + for (int64_t r = 1; r != h; ++r) { + blas.AXPY(w, 1., in_data + r * w, out_data); + } + } } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); @@ -223,6 +233,7 @@ class SequencePoolGradFunctor { } auto lod = in_grad->lod()[0]; auto& place = *context.eigen_device(); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_grad->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -237,7 +248,11 @@ class SequencePoolGradFunctor { if (pooltype == "AVERAGE") { in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); } else if (pooltype == "SUM") { - in_g_e.device(place) = (out_g_e).broadcast(bcast); + const T* out_g_data = out_g_t.data(); + T* in_g_data = in_g_t.mutable_data(context.GetPlace()); + for (int r = 0; r != h; ++r) { + blas.VCOPY(w, out_g_data, in_g_data + r * w); + } } else if (pooltype == "SQRT") { in_g_e.device(place) = (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); From e2e82bde32709a0bedaf940c60c3d5e3b73d22b1 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 11 Oct 2018 21:12:56 +0800 Subject: [PATCH 05/48] Accelerate Reshape op --- paddle/fluid/operators/reshape_op.cc | 82 ++++++++++++-------- paddle/fluid/operators/sequence_concat_op.cc | 5 +- 2 files changed, 51 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d72f85f2c4..b8fdc3f826 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to gurantee shape inference in compile-time. 
)DOC"); @@ -195,6 +195,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } }; +template class ReshapeKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -227,12 +228,15 @@ class ReshapeKernel { "sequence_reshape op."); } - out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); + if (in->data() != + reinterpret_cast(out->mutable_data(ctx.GetPlace(), in->type()))) { + framework::TensorCopySync(*in, ctx.GetPlace(), out); + } out->Resize(out_dims); } }; +template class ReshapeGradKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -240,8 +244,9 @@ class ReshapeGradKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); auto in_dims = d_x->dims(); - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + if (d_out->data() != d_x->mutable_data(ctx.GetPlace(), d_out->type())) { + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + } d_x->Resize(in_dims); } }; @@ -259,7 +264,6 @@ class Reshape2Op : public ReshapeOp { : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - ReshapeOp::InferShape(ctx); PADDLE_ENFORCE(ctx->HasOutput("XShape"), "Output(XShape) of ReshapeOp should not be null."); const auto &x_dims = ctx->GetInputDim("X"); @@ -270,6 +274,8 @@ class Reshape2Op : public ReshapeOp { } ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); } }; @@ -335,38 +341,46 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #ifdef PADDLE_WITH_CUDA -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, 
ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 397a318295..12b53be708 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -90,11 +90,12 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, paddle::framework::DefaultGradOpDescMaker); template using Kernel = op::SeqConcatKernel; -REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, + Kernel); REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template using GradKernel = op::SeqConcatGradKernel; REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel, - GradKernel); + GradKernel, GradKernel); From f40848828df2bdb5d80675802e4d71bf4f817c3e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 11 Oct 2018 22:39:04 +0800 Subject: [PATCH 06/48] Polish code test=develop --- paddle/fluid/operators/sequence_concat_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 12b53be708..3234b60861 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -92,6 +92,7 @@ template using Kernel = op::SeqConcatKernel; REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, Kernel); + REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template From d8a1b770976ea22520a66488f75e4f4b5e6e0e49 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 11 Oct 2018 22:49:12 +0800 Subject: [PATCH 07/48] Add margin_rank_loss_op to python --- python/paddle/fluid/layers/nn.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8c0ef7a824..9a0e68f5d8 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -107,6 +107,7 @@ __all__ = [ 'log', 'crop', 'rank_loss', + 'margin_rank_loss', 'elu', 'relu6', 'pow', @@ -5827,6 +5828,46 @@ def rank_loss(label, left, right, name=None): return out +def margin_rank_loss(label, left, right, margin=0.1, name=None): + """ + **Margin Rank loss layer for RankNet** + Args: + label (Variable): Indicats whether A ranked 
higher than B or not. + left (Variable): RankNet's output score for doc A. + right (Variable): RankNet's output score for doc B. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + Returns: + list: The value of rank loss. + Raises: + ValueError: Any of label, left, and right is not a variable. + Examples: + .. code-block:: python + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") + left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") + right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") + out = fluid.layers.margin_rank_loss(label, left, right) + """ + helper = LayerHelper('margin_rank_loss', **locals()) + if not (isinstance(label, Variable)): + raise ValueError("The label should be a Variable") + if not (isinstance(left, Variable)): + raise ValueError("The left should be a Variable") + if not (isinstance(right, Variable)): + raise ValueError("The right should be a Variable") + out = helper.create_tmp_variable("float32") + act = helper.create_tmp_variable("float32") + helper.append_op( + type='margin_rank_loss', + inputs={"Label": label, + "X1": left, + "X2": right}, + outputs={'Out': out, + 'Activated': act}, + attrs={'margin': margin}) + return out + + def pad2d(input, paddings=[0, 0, 0, 0], mode='constant', @@ -6290,6 +6331,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): outputs={'Out': out}, attrs={'win_size': win_size, 'pad_value': pad_value}) + return out def sequence_mask(x, maxlen=None, dtype='int64', name=None): From 0cb88c34bea180736fd1882b8a928c1a382e88bf Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 11 Oct 2018 16:20:51 +0000 Subject: [PATCH 08/48] add op converter --- paddle/fluid/inference/analysis/analyzer.cc | 2 +- .../api/api_tensorrt_subgraph_engine.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 6 +- .../inference/tensorrt/convert/pad_op.cc | 68 +++++++++++++++++++ .../inference/tensorrt/convert/test_pad_op.cc | 52 ++++++++++++++ 5 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/pad_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_pad_op.cc diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 8a8aeb5e09..d780592eb9 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", "elementwise_add", "dropout"}); if (!node->IsFunction()) return false; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 5ee6a5a931..7ac468ee4d 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(batch_norm); USE_TRT_CONVERTER(concat); USE_TRT_CONVERTER(dropout); +USE_TRT_CONVERTER(pad); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index fac1babf6e..0a35e10f69 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ 
b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
 DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL) nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
- nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
+
+nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
+ DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
 diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc new file mode 100644 index 0000000000..218030a591 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * PadOp.
+ */
+class PadOpConverter : public OpConverter {
+ public:
+ void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope, bool test_mode) override {
+ VLOG(4) << "convert a fluid pad op to a tensorrt padding layer";
+
+ framework::OpDesc op_desc(op, nullptr);
+ // Declare inputs
+ auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+
+ const std::vector paddings =
+ boost::get>(op_desc.GetAttr("paddings"));
+ const float pad_value = boost::get(op_desc.GetAttr("pad_value"));
+
+ nvinfer1::Dims input_shape = input->getDimensions();
+ int nbDims = input_shape.nbDims;
+ int pad_size = static_cast(paddings.size());
+ PADDLE_ENFORCE_GE(nbDims, 2);
+ PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size);
+ PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only supports zero padding.");
+
+ nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]);
+ nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]);
+
+ auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Padding,
+ *const_cast(input),
+ pre_pad, post_pad);
+
+ PADDLE_ENFORCE(layer != nullptr);
+ auto output_name = op_desc.Output("Out")[0];
+ engine_->SetITensor(output_name, layer->getOutput(0));
+ layer->setName(("pad (Output: " + output_name + ")").c_str());
+ layer->getOutput(0)->setName(output_name.c_str());
+ if (test_mode) { // the test framework cannot determine which is the
+ // output, so place the declaration inside.
+ engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(pad, PadOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc new file mode 100644 index 0000000000..ba35d7ddbb --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(PadConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("pad-X", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("pad-Out", nvinfer1::Dims3(3, 3, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pad"); + desc.SetInput("X", {"pad-X"}); + desc.SetOutput("Out", {"pad-Out"}); + + std::vector paddings = {0, 0, 0, 0, 0, 1, 1, 2}; + float pad_value = 0.0; + desc.SetAttr("paddings", paddings); + desc.SetAttr("pad_value", pad_value); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(2); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pad); From 9878eedbaafb97f546c33e7ec9bb8f138d6d3269 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 12 Oct 2018 09:35:01 +0800 Subject: [PATCH 09/48] Change API.spec test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index d0ae802746..a6728f2cc0 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -127,6 +127,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)) paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)) paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) From 35e547e00fc71b15530c07547876e5b970b62fa8 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 12 Oct 2018 10:31:40 +0800 Subject: [PATCH 10/48] Polish API doc test=develop 
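
For reviewers, a minimal NumPy sketch of the loss this layer computes, following the formula max(0, -label * (left - right) + margin) given in the polished docstring; the helper name margin_rank_loss_ref and the sample data below are illustrative only and are not part of this patch:

import numpy as np

def margin_rank_loss_ref(label, left, right, margin=0.1):
    # Illustrative reference only, not the fluid implementation.
    # label is +1 when `left` should rank higher than `right`, -1 otherwise;
    # the loss vanishes once `left` beats `right` by at least `margin`.
    return np.maximum(0.0, -label * (left - right) + margin)

label = np.array([[1.0], [-1.0], [1.0], [1.0]], dtype="float32")
left = np.array([[0.9], [0.2], [0.7], [0.1]], dtype="float32")
right = np.array([[0.3], [0.5], [0.6], [0.8]], dtype="float32")
print(margin_rank_loss_ref(label, left, right))  # per-pair losses: 0, 0, 0, 0.8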
--- python/paddle/fluid/layers/nn.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9a0e68f5d8..3f7adc4093 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5830,12 +5830,13 @@ def rank_loss(label, left, right, name=None): def margin_rank_loss(label, left, right, margin=0.1, name=None): """
- **Margin Rank loss layer for RankNet**
+ **Margin Rank loss layer for rank problems**
 Args:
- label (Variable): Indicats whether A ranked higher than B or not.
- left (Variable): RankNet's output score for doc A.
- right (Variable): RankNet's output score for doc B.
- name(str|None): A name for this layer(optional). If set None, the layer
+ label (Variable): Indicates whether left is ranked higher than (right + margin) or not.
+ left (Variable): rank score for left.
+ right (Variable): rank score for right.
+ margin (float): Indicates the margin to be added to right
+ name (str|None): A name for this layer (optional). If set None, the layer
 will be named automatically. Returns: list: The value of rank loss. @@ -5843,7 +5844,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): Raises: ValueError: Any of label, left, and right is not a variable. Examples: .. code-block:: python
- label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
+ label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
 left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") out = fluid.layers.margin_rank_loss(label, left, right) From 7f6ff6f5a90b4d085275bd066ed95e723832d061 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 12 Oct 2018 10:46:44 +0800 Subject: [PATCH 11/48] Polish doc test=develop --- python/paddle/fluid/layers/nn.py | 44 ++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3f7adc4093..7261b3009b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5830,25 +5830,31 @@ def rank_loss(label, left, right, name=None): def margin_rank_loss(label, left, right, margin=0.1, name=None): """
- **Margin Rank loss layer for rank problems**
- Args:
- label (Variable): Indicates whether left is ranked higher than (right + margin) or not.
- left (Variable): rank score for left.
- right (Variable): rank score for right.
- margin (float): Indicates the margin to be added to right
- name (str|None): A name for this layer (optional). If set None, the layer
- will be named automatically.
- Returns:
- list: The value of rank loss.
- Raises:
- ValueError: Any of label, left, and right is not a variable.
- Examples:
- .. code-block:: python
- label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
- left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
- right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
- out = fluid.layers.margin_rank_loss(label, left, right)
- """
+ Margin Rank loss layer for rank problems, which compares the left score and the right score passed in.
+ The rank loss can be defined by the following equation:
+
+ .. math::
+
+ rank\_loss = max(0, -label * (left - right) + margin)
+
+ Args:
+ label (Variable): Indicates whether left is ranked higher than (right + margin) or not.
+ left (Variable): rank score for left.
+ right (Variable): rank score for right.
+ margin (float): Indicates the margin to be added to right + name (str|None): A name for this layer (optional). If set None, the layer + will be named automatically. + Returns: + list: The value of rank loss. + Raises: + ValueError: Any of label, left, and right is not a variable. + Examples: + .. code-block:: python + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") + left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") + right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") + out = fluid.layers.margin_rank_loss(label, left, right) + """ helper = LayerHelper('margin_rank_loss', **locals()) if not (isinstance(label, Variable)): raise ValueError("The label should be a Variable") From 3f6ec900605d6b33d3f7f8a24ba0af95b3267153 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 12 Oct 2018 11:26:54 +0800 Subject: [PATCH 12/48] Polish code test=develop --- paddle/fluid/operators/math/selected_rows_functor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 43d593710c..6810c24227 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -172,7 +172,6 @@ struct SelectedRowsSumTo { } input2->set_rows(in2_rows); - // start = std::chrono::system_clock::now(); auto* in2_value = input2->mutable_value(); auto* in2_data = in2_value->data(); auto blas = math::GetBlas(context); @@ -275,7 +274,7 @@ struct MergeAdd { auto* input_data = input.value().data(); for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = FindPos(merge_rows, input_rows[i]); + size_t out_i = rows_pos_map[input_rows[i]]; for (int64_t j = 0; j < input_width; j++) { out_data[out_i * input_width + j] += input_data[i * input_width + j]; } From 5f2e837847db9fff219333e03f4867abaa75768c Mon Sep 17 00:00:00 2001 From: Dun Date: Fri, 12 Oct 2018 13:24:57 +0800 Subject: [PATCH 13/48] optimize depthwise conv by register memory (#13778) * optimize depthwise conv by register memory * test=develop --- paddle/fluid/operators/math/depthwise_conv.cu | 275 +++++++++++++----- 1 file changed, 210 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 3be3899123..66d37c3bf3 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -46,17 +46,20 @@ __forceinline__ __device__ unsigned warp_id() { return ret; } +#define ARG_DEFINE_KernelDepthwiseConv \ + const T *const input_data, const T *const filter_data, const int batch_size, \ + const int output_channels, const int output_height, \ + const int output_width, const int input_channels, \ + const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int stride_width, \ + const int padding_height, const int padding_width, \ + const int dilate_height, const int dilate_width, T *const output_data + // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. 
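For reference, the forward pass these kernels implement is a depthwise convolution in NCHW layout, one filter per input channel. The naive NumPy sketch below assumes filter multiplier 1 and is for intuition only, not the CUDA implementation; the patch's speedup comes from specializing the kernel on a compile-time filter size (c_filter) so the weights can be preloaded into registers (r_weight) instead of re-read from global memory.

    import numpy as np

    def depthwise_conv2d_ref(x, w, stride=1, pad=0, dilation=1):
        # x: [N, C, H, W]; w: [C, KH, KW], one filter per input channel.
        n, c, h, wd = x.shape
        kh, kw = w.shape[1], w.shape[2]
        oh = (h + 2 * pad - (dilation * (kh - 1) + 1)) // stride + 1
        ow = (wd + 2 * pad - (dilation * (kw - 1) + 1)) // stride + 1
        xp = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)))
        out = np.zeros((n, c, oh, ow), dtype=x.dtype)
        for i in range(oh):
            for j in range(ow):
                for fi in range(kh):
                    for fj in range(kw):
                        hi = i * stride + fi * dilation
                        wj = j * stride + fj * dilation
                        # Each channel is convolved with its own filter tap.
                        out[:, :, i, j] += xp[:, :, hi, wj] * w[None, :, fi, fj]
        return out

    x = np.random.rand(1, 3, 8, 8).astype(np.float32)
    w = np.random.rand(3, 3, 3).astype(np.float32)
    print(depthwise_conv2d_ref(x, w, stride=2, pad=1).shape)  # (1, 3, 4, 4)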
template -__device__ __inline__ void KernelDepthwiseConv( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { +__device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { const int batch = blockIdx.y; @@ -97,42 +100,105 @@ __device__ __inline__ void KernelDepthwiseConv( } } -template -__global__ void KernelDepthwiseConvSp( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, filter_multiplier, - filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, - dilate_height, dilate_width, output_data); +template +__device__ __inline__ void KernelDepthwiseConvCFilter( + ARG_DEFINE_KernelDepthwiseConv) { + const int kWeghtSize = c_filter * c_filter; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) r_weight[i] = weight[i]; - else - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, - filter_height, filter_height, c_stride, c_stride, - padding_height, padding_width, dilate_height, - dilate_width, output_data); + for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { + for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + + const int c_in = c_out / filter_multiplier; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + c_filter * dilate_height; + const int w_in_end = w_in_start + c_filter * dilate_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? 
w_in_start : 0; + + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; + h_in += dilate_height, h_f++) { + for (int w_in = w_in_start, w_f = 0; w_f < c_filter; + w_in += dilate_width, w_f++) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && + w_in < input_width) { + const int offset = in_offset + h_in * input_width + w_in; + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + } + } + } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; + } + } +} + +template +__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { + if (c_filter_multiplier == 0) { + if (c_filter == -1) + KernelDepthwiseConv( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + } else { + if (c_filter == -1) + KernelDepthwiseConv(input_data, filter_data, batch_size, + output_channels, output_height, output_width, + input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, + c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + output_data); + } } // CUDA kernel to compute the depthwise convolution backprop w.r.t input. 
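The input-gradient kernels that follow compute the adjoint of the forward loop above: each output-gradient element is scattered back through the same input taps the forward pass read. A NumPy sketch under the same assumptions (filter multiplier 1, NCHW layout; illustrative only, not the CUDA implementation):

    import numpy as np

    def depthwise_conv2d_input_grad_ref(dout, w, x_shape,
                                        stride=1, pad=0, dilation=1):
        # dout: [N, C, OH, OW]; w: [C, KH, KW]; returns d(loss)/d(input).
        n, c, h, wd = x_shape
        kh, kw = w.shape[1], w.shape[2]
        oh, ow = dout.shape[2], dout.shape[3]
        dxp = np.zeros((n, c, h + 2 * pad, wd + 2 * pad), dtype=dout.dtype)
        for i in range(oh):
            for j in range(ow):
                for fi in range(kh):
                    for fj in range(kw):
                        hi = i * stride + fi * dilation
                        wj = j * stride + fj * dilation
                        # Scatter-add: the adjoint of the forward read at (hi, wj).
                        dxp[:, :, hi, wj] += dout[:, :, i, j] * w[None, :, fi, fj]
        return dxp[:, :, pad:pad + h, pad:pad + wd]

    dout = np.ones((1, 3, 4, 4), dtype=np.float32)
    w = np.ones((3, 3, 3), dtype=np.float32)
    print(depthwise_conv2d_input_grad_ref(dout, w, (1, 3, 8, 8),
                                          stride=2, pad=1).shape)  # (1, 3, 8, 8)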
+#define ARG_DEFINE_KernelDepthwiseConvInputGrad \ + const T *const output_grad_data, const T *const filter_data, \ + const int batch_size, const int output_channels, \ + const int output_height, const int output_width, \ + const int input_channels, const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int stride_width, \ + const int padding_height, const int padding_width, \ + const int dilate_height, const int dilate_width, \ + T *const input_grad_data + template __device__ __inline__ void KernelDepthwiseConvInputGrad( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { const int batch = blockIdx.y; @@ -184,15 +250,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( } } -template +template +__device__ __inline__ void KernelDepthwiseConvInputGradCFilter( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) + r_weight[i + c_i * c_filter * c_filter] = + weight[c_filter * c_filter - i - 1]; + } + + for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { + for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; + + int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + + T value = 0; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + for (int h_out = h_out_start, h_f = 0; h_f < c_filter; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < c_filter; + w_out += dilate_width, w_f++) { + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += + output_grad_data[output_grad_offset] * + r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; + } + } + } + } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + input_grad_data[index] = value; + } + } +} + +template __global__ void KernelDepthwiseConvInputGradSp( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const 
int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { if (c_filter_multiplier == 0) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, @@ -200,13 +318,20 @@ __global__ void KernelDepthwiseConvInputGradSp( filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); - else + else if (c_filter == -1) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); + else + KernelDepthwiseConvInputGradCFilter( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + input_grad_data); } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. @@ -325,12 +450,14 @@ class DepthwiseConvFunctor { dim3 threads(std::min(output_width, thread), blocks, 1); dim3 grid(output_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvSp<<>>( \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + KernelDepthwiseConvSp<<>>( \ input_data, filter_data, batch_size, output_channels, output_height, \ output_width, input_channels, input_height, input_width, \ filter_multiplier, ksize_height, ksize_width, stride_height, \ @@ -338,11 +465,17 @@ class DepthwiseConvFunctor { dilate_width, output_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(0, 0, 3); + check_case(0, 0, 5); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. 
check_case(2^n,1) #undef check_case } }; @@ -384,13 +517,15 @@ class DepthwiseConvInputGradFunctor { dim3 grid(input_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, \ - c_stride><<>>( \ + T, c_filter_multiplier, c_stride, \ + c_filter><<>>( \ output_grad_data, filter_data, batch_size, output_channels, \ output_height, output_width, input_channels, input_height, \ input_width, filter_multiplier, ksize_height, ksize_width, \ @@ -398,11 +533,21 @@ class DepthwiseConvInputGradFunctor { dilate_height, dilate_width, input_grad_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(2, 1, 3); + check_case(2, 1, 5); + check_case(2, 1, -1); + check_case(2, 2, 3); + check_case(2, 2, 5); + check_case(2, 2, -1); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. check_case(2^n,1) #undef check_case } }; From 228506618b23845792d7f6de74cb6b97cd7bfb13 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 12 Oct 2018 14:41:57 +0800 Subject: [PATCH 14/48] Avoid GetMutable implicitly reset Var Type. This can cause a lot of problem: 1. Wrong operator implementation, Op can get a wrong type without failure. 2. Anytype can be Get without defined in VarType. Also fix wrong STEP_SCOPE usage. test=develop --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 +-- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/variable.h | 6 +++++- paddle/fluid/framework/variable_test.cc | 11 ++++++----- python/paddle/fluid/layers/io.py | 6 +++++- python/paddle/fluid/layers/tensor.py | 2 +- python/paddle/fluid/tests/book/test_word2vec.py | 16 ++-------------- 8 files changed, 22 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4..a070b8efb8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5eb..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. 
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable>()); + auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65..2840d503f1 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b83..873e1b20a5 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4..003dcfd3df 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 25fde782b7..a06cd4982f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -56,7 +56,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. - append_batch_size(bool): Whether or not to append the data as a batch. + append_batch_size(bool): + 1. If true, it prepends -1 to the shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be be False (ineffective). dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 44b92af7ac..9c6a2112a6 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -100,7 +100,7 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a new variable in the global block(block 0). + Create a new tensor variable with value in the global block(block 0). 
Args: shape(list[int]): shape of the variable diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 9191f0fc20..1f3a230048 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,7 +17,6 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - avg_cost, predict_word = __network__( - list( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, - next_word - ]))) - pd.write_output(avg_cost) - - avg_cost = fluid.layers.mean(pd()) + raise ValueError('is_parallel=True not implemented') sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, True): + for is_parallel in (False, ): # TODO(paddle-dev): Add parallel test. inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__': From dc5a7b906d18e1b0d26fe65e69d92e21966463fa Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 12 Oct 2018 14:26:21 +0800 Subject: [PATCH 15/48] fix default number of threads when inference with or without MKLDNN test=develop --- paddle/fluid/inference/api/analysis_predictor.cc | 5 +++++ paddle/fluid/inference/api/api_impl.cc | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3bc6af5241..f9135ff9d7 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -25,9 +25,11 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { @@ -47,6 +49,9 @@ bool AnalysisPredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 6682e0a81b..7cda9c5d8a 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -23,9 +23,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { From a7cae62bbb0f0bd10be554ca5beff5911f8d77dc Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 12 Oct 2018 14:58:10 +0800 Subject: [PATCH 16/48] Polish code test=develop --- python/paddle/fluid/layers/nn.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7261b3009b..234692cb29 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5830,24 +5830,25 @@ def rank_loss(label, left, right, name=None): def margin_rank_loss(label, left, right, margin=0.1, name=None): """ - Margin Rank loss layer for rank problem, which comparing left value and right value be passed in. - The rank loss can be defined as below equation: + Margin Ranking Loss Layer for ranking problem, + which compare left score and right score passed in. + The ranking loss can be defined as following equation: .. math:: rank\_loss &= max(0, -label * (left - right) + margin) Args: - label (Variable): Indicats whether left higher than (right + margin) or not. - left (Variable): rank score for left. - right (Variable): rank score for right. - margin (float): Indicates the margin to be added to right + label (Variable): Indicates whether the left is ranked higher than the right or not. + left (Variable): ranking score for left. + right (Variable): ranking score for right. + margin (float): Indicates the given margin to be added to right name (str|None): A name for this layer (optional). If set None, the layer will be named automatically. Returns: - list: The value of rank loss. + list: The Variable of ranking loss. Raises: - ValueError: Any of label, left, and right is not a variable. + ValueError: Any of label, left, and right is not a Variable. Examples: .. 
code-block:: python label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") @@ -5862,8 +5863,8 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): raise ValueError("The left should be a Variable") if not (isinstance(right, Variable)): raise ValueError("The right should be a Variable") - out = helper.create_tmp_variable("float32") - act = helper.create_tmp_variable("float32") + out = helper.create_tmp_variable(left.dtype) + act = helper.create_tmp_variable(left.dtype) helper.append_op( type='margin_rank_loss', inputs={"Label": label, From f03e0e49a8285279e9e5ed34d9ca821ec60611e8 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 12 Oct 2018 15:00:38 +0800 Subject: [PATCH 17/48] Polish doc test=develop --- python/paddle/fluid/layers/nn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 234692cb29..a3cae9385c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5840,13 +5840,13 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): Args: label (Variable): Indicates whether the left is ranked higher than the right or not. - left (Variable): ranking score for left. - right (Variable): ranking score for right. + left (Variable): Ranking score for left. + right (Variable): Ranking score for right. margin (float): Indicates the given margin to be added to right name (str|None): A name for this layer (optional). If set None, the layer will be named automatically. Returns: - list: The Variable of ranking loss. + Variable: The ranking loss. Raises: ValueError: Any of label, left, and right is not a Variable. Examples: From 4a22979bcaf7612511e4654a770d54ec8f68e61e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 12 Oct 2018 15:11:28 +0800 Subject: [PATCH 18/48] Polish code test=develop --- python/paddle/fluid/layers/nn.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a3cae9385c..43aa4a9e7c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5831,7 +5831,7 @@ def rank_loss(label, left, right, name=None): def margin_rank_loss(label, left, right, margin=0.1, name=None): """ Margin Ranking Loss Layer for ranking problem, - which compare left score and right score passed in. + which compares left score and right score passed in. The ranking loss can be defined as following equation: .. math:: @@ -5842,7 +5842,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): label (Variable): Indicates whether the left is ranked higher than the right or not. left (Variable): Ranking score for left. right (Variable): Ranking score for right. - margin (float): Indicates the given margin to be added to right + margin (float): Indicates the given margin. name (str|None): A name for this layer (optional). If set None, the layer will be named automatically. 
Returns: @@ -5857,12 +5857,12 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): out = fluid.layers.margin_rank_loss(label, left, right) """ helper = LayerHelper('margin_rank_loss', **locals()) - if not (isinstance(label, Variable)): - raise ValueError("The label should be a Variable") - if not (isinstance(left, Variable)): - raise ValueError("The left should be a Variable") - if not (isinstance(right, Variable)): - raise ValueError("The right should be a Variable") + if not isinstance(label, Variable): + raise ValueError("The label should be a Variable.") + if not isinstance(left, Variable): + raise ValueError("The left should be a Variable.") + if not isinstance(right, Variable): + raise ValueError("The right should be a Variable.") out = helper.create_tmp_variable(left.dtype) act = helper.create_tmp_variable(left.dtype) helper.append_op( From cbe429251637922d731328e06e9b0a5f23d63a21 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 12 Oct 2018 09:55:37 +0000 Subject: [PATCH 19/48] Add sequence unpad op test=develop --- paddle/fluid/operators/sequence_unpad_op.cc | 153 ++++++++++++++++++ paddle/fluid/operators/sequence_unpad_op.cu | 30 ++++ paddle/fluid/operators/sequence_unpad_op.h | 104 ++++++++++++ .../tests/unittests/test_sequence_unpad_op.py | 75 +++++++++ 4 files changed, 362 insertions(+) create mode 100644 paddle/fluid/operators/sequence_unpad_op.cc create mode 100644 paddle/fluid/operators/sequence_unpad_op.cu create mode 100644 paddle/fluid/operators/sequence_unpad_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py diff --git a/paddle/fluid/operators/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_unpad_op.cc new file mode 100644 index 0000000000..f3a0762b9a --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace paddle { +namespace operators { + +class SequenceUnpadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceUnpadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + + auto len_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1, + "The shape of Input(Length) should be [batch_size, 1]."); + PADDLE_ENFORCE( + len_dims[0] == x_dims[0], + "Input(X) and Input(Length) should have the same first dimension."); + + int64_t out_dim_0 = -1; + if (ctx->IsRuntime()) { + out_dim_0 = x_dims[0] * x_dims[1]; + } + + std::vector<int64_t> out_dims_vec{out_dim_0}; + if (x_dims.size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims_vec.push_back(x_dims[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor<float>) Input tensor which " + "contains the padded sequences with equal length."); + AddInput("Length", + "(LoDTensor) The input tensor which specifies the actual length of " + "sequences after unpadding."); + AddOutput( + "Out", + "(LoDTensor) The output tensor which contains unpadded sequences."); + AddComment(R"DOC( + Sequence Unpad Operator + + This operator removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by LoD + information.
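The unpadding semantics can be stated compactly in NumPy; the sketch below mirrors the Example that follows (the function name and list-based lod are illustrative only; the operator attaches the offsets as level-0 LoD on its output):

    import numpy as np

    def sequence_unpad_ref(x, length):
        # x: [batch_size, padded_length, ...]; length[i] is the real length of
        # sequence i. Concatenate the un-padded prefixes and record LoD offsets.
        pieces = [x[i, :length[i]] for i in range(x.shape[0])]
        out = np.concatenate(pieces, axis=0)
        lod = np.cumsum([0] + list(length)).tolist()
        return out, lod

    x = np.array([[1.0, 2.0, 3.0, 4.0, 5.0],
                  [6.0, 7.0, 8.0, 9.0, 10.0],
                  [11.0, 12.0, 13.0, 14.0, 15.0]])
    out, lod = sequence_unpad_ref(x, [2, 3, 4])
    print(out)  # [ 1.  2.  6.  7.  8. 11. 12. 13. 14.]
    print(lod)  # [0, 2, 5, 9]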
+ + Example: + + Given input tensor Input(X): + X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], +` + in which there are 3 sequences padded to length 5, and the acutal length + specified by Input(Length): + + Length.data = [[2], [3], [4]], + + after unpadding, Output(Out) will be: + + Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + Out.lod = [[0, 2, 5, 9]] + + )DOC"); + } +}; + +class SequenceUnpadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceUnpadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, + ops::SequenceUnpadOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_unpad_op.cu new file mode 100644 index 0000000000..7524837223 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.h b/paddle/fluid/operators/sequence_unpad_op.h new file mode 100644 index 0000000000..ebe3118b98 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequenceUnpadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x_t = ctx.Input("X"); + auto* len_t = ctx.Input("Length"); + auto* out_t = ctx.Output("Out"); + out_t->mutable_data(ctx.GetPlace()); + + const int64_t* seq_len_ptr = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + LoDTensor seq_len_cpu; + seq_len_cpu.Resize(len_t->dims()); + seq_len_ptr = seq_len_cpu.mutable_data(platform::CPUPlace()); + framework::TensorCopy(*len_t, platform::CPUPlace(), + ctx.template device_context(), + &seq_len_cpu); + } else { + seq_len_ptr = len_t->data(); + } + + size_t batch_size = x_t->dims()[0]; + std::vector out_lod0(batch_size + 1, 0); + for (size_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; + } + + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out_t->set_lod(out_lod); + + std::vector out_dims_vec{static_cast(out_lod0.back())}; + if (x_t->dims().size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_t->dims().size(); ++i) { + out_dims_vec.push_back(x_t->dims()[i]); + } + } + out_t->Resize(framework::make_ddim(out_dims_vec)); + + int64_t padded_length = x_t->dims()[1]; + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *x_t, out_t, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequenceUnpadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out = ctx.Input(framework::GradVarName("Out")); + const auto* x_t = ctx.Input("X"); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = x_t->dims()[1]; + + LoDTensor zero_pads; + zero_pads.Resize({1, 1}); + zero_pads.mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, &zero_pads, static_cast(0)); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, zero_pads, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py new file mode 100644 index 0000000000..673b0ea180 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import six +import numpy as np +from op_test import OpTest + + +class TestSequenceUnpadOp(OpTest): + def init(self): + self.length = [2, 3, 4] + self.x_shape = (3, 5) + self.dtype = "float32" + + def compute(self): + assert len(self.length) == self.x_shape[0] + x = np.random.random(self.x_shape).astype(self.dtype) + out_lod = [self.length] + + out = x[0, 0:self.length[0]] + for i in six.moves.xrange(1, x.shape[0]): + out = np.append(out, x[i, 0:self.length[i]], axis=0) + + out_shape = (sum(self.length), ) + if len(self.x_shape) == 2: + out_shape = out_shape + (1, ) + else: + out_shape = out_shape + self.x_shape[2:] + + self.inputs = { + 'X': x, + 'Length': np.array(self.length).astype('int64').reshape(-1, 1) + } + self.outputs = {'Out': (out.reshape(out_shape), out_lod)} + + def setUp(self): + self.op_type = 'sequence_unpad' + self.init() + self.compute() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestSequenceUnpadOp2(TestSequenceUnpadOp): + def init(self): + self.length = [2, 3, 4] + self.x_shape = (3, 5, 4, 3) + self.dtype = "float32" + + +class TestSequenceUnpadOp3(TestSequenceUnpadOp): + def init(self): + self.length = [5, 2, 3, 4] + self.x_shape = (4, 5, 3, 3, 6) + self.dtype = "float64" + + +if __name__ == '__main__': + unittest.main() From 9c77b65c06af0a2a95e244074f5ad7afcb9dc5e8 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 12 Oct 2018 19:39:44 +0800 Subject: [PATCH 20/48] Fix layers.uniform_random (#13823) * fix layers.uniform_random * fix uniform_random test=develop * remove var type set test=develop * fix similar error test=develop --- paddle/fluid/operators/uniform_random_op.cc | 32 ++++++++++----------- python/paddle/fluid/layers/ops.py | 17 +++++++---- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 763bb40358..aa907595cb 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -23,14 +23,14 @@ namespace operators { template class CPUUniformRandomKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* tensor = nullptr; + void Compute(const framework::ExecutionContext &ctx) const override { + framework::Tensor *tensor = nullptr; auto out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { auto shape = ctx.Attr>("shape"); - auto* selected_rows = out_var->GetMutable(); + auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); tensor->Resize(framework::make_ddim(shape)); selected_rows->mutable_rows()->reserve(shape[0]); @@ -39,7 +39,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { "uniform_random_op's output only" "supports SelectedRows and LoDTensor"); } - T* data = 
tensor->mutable_data(ctx.GetPlace()); + T *data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; if (seed == 0) { @@ -60,14 +60,14 @@ class UniformRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UniformRandomOp should not be null."); PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); for (auto dim : shape) { @@ -78,7 +78,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), ctx.GetPlace()); @@ -112,17 +112,17 @@ uniform distribution. The random result is in set [min, max]. class UniformRandomOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto out_var_name = op_desc.Output("Out").front(); - if (block->FindRecursiveOrCreateVar(out_var_name).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::SELECTED_ROWS); - } else { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::LOD_TENSOR); + auto var_data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + + auto out_var = block->FindRecursiveOrCreateVar(out_var_name); + if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) { + out_var.SetType(framework::proto::VarType::LOD_TENSOR); } + out_var.SetDataType(var_data_type); } }; diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 9a8300524d..1ff40a26f2 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -14,6 +14,8 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr +from .. 
import core +from ..framework import convert_np_dtype_to_dtype_ __activations_noattr__ = [ 'sigmoid', @@ -58,8 +60,11 @@ _uniform_random_ = generate_layer_fn('uniform_random') def uniform_random(shape, dtype=None, min=None, max=None, seed=None): + locals_var = locals().keys() + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -78,8 +83,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink') def hard_shrink(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -99,12 +105,12 @@ _cum_sum_ = generate_layer_fn('cumsum') def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val - return _cum_sum_(**kwargs) @@ -121,8 +127,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu') def thresholded_relu(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val From 55d6950a1aca273679a5966c6441015a4c960d9c Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 10 Oct 2018 12:32:43 +0200 Subject: [PATCH 21/48] rewrite conv_bn fuse pass to eigen test=develop --- .../fluid/framework/ir/conv_bn_fuse_pass.cc | 135 +++++------------- 1 file changed, 39 insertions(+), 96 deletions(-) diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 95d7138381..86926bec64 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -44,87 +44,16 @@ namespace ir { GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \ GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name) -template -LoDTensor tensor_apply(const LoDTensor& vec, UnaryOperation f) { - LoDTensor vec_y; - vec_y.Resize(vec.dims()); - const float* x = vec.data(); - float* y = vec_y.mutable_data(platform::CPUPlace()); - for (int64_t i = 0; i < vec.numel(); i++) { - y[i] = f(x[i]); - } - return vec_y; -} - -void tensor_apply_inplace(LoDTensor* vec, float (*f)(float)) { - float* data = vec->mutable_data(platform::CPUPlace()); - for (int64_t i = 0; i < vec->numel(); i++) { - data[i] = f(data[i]); - } -} - -template -LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, - BinaryOperation f) { - PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims()); - LoDTensor vec_y; - vec_y.Resize(vec_a.dims()); - const float* a = vec_a.data(); - const float* b = vec_b.data(); - float* y = vec_y.mutable_data(platform::CPUPlace()); - for (int64_t i = 0; i < vec_a.numel(); i++) { - y[i] = f(a[i], b[i]); - } - return vec_y; -} - -template -LoDTensor tensor_apply_eltwise_broadcast(const LoDTensor& vec_a, - const LoDTensor& vec_b, - BinaryOperation f) { - PADDLE_ENFORCE_EQ(vec_a.dims().size(), 2); - PADDLE_ENFORCE_EQ(vec_b.dims().size(), 2); - PADDLE_ENFORCE_EQ(vec_a.dims()[0], vec_b.dims()[0]); - PADDLE_ENFORCE_EQ(vec_b.dims()[1], 1); - LoDTensor vec_y; - vec_y.Resize(vec_a.dims()); - const float* a = vec_a.data(); - const float* b = vec_b.data(); - float* y = vec_y.mutable_data(platform::CPUPlace()); - size_t a_height = vec_a.dims()[0]; - 
size_t a_width = vec_a.dims()[1]; - for (size_t h = 0; h < a_height; h++) { - for (size_t w = 0; w < a_width; ++w) { - *(y++) = f(*(a++), b[h]); - } - } - return vec_y; -} - // reshape to two dimensions {A, B * C * ...} -void make_tensor_2d(LoDTensor* tensor_to_reshape) { - auto dims_count = tensor_to_reshape->dims().size(); +DDim make_dims_2d(DDim dims) { + auto dims_count = dims.size(); PADDLE_ENFORCE_GT(dims_count, 0); int size2 = 1; for (int i = 1; i < dims_count; i++) { - size2 *= tensor_to_reshape->dims()[i]; + size2 *= dims[i]; } - tensor_to_reshape->Resize(make_ddim({tensor_to_reshape->dims()[0], size2})); -} - -void recompute_conv_weights(LoDTensor* weights, LoDTensor* tmp) { - // remember the weights tensor shape {A, B, C, ...} - auto weights_shape = weights->dims(); - // reduce the weights to 2d {A, B * C * ...} - make_tensor_2d(weights); - // make tmp tensor 2d by adding 1 as second dim {A, 1} - make_tensor_2d(tmp); - - *weights = - tensor_apply_eltwise_broadcast(*weights, *tmp, std::multiplies()); - // reshape weights to the original dims {A, B, C, ...} - weights->Resize(weights_shape); + return make_ddim({dims[0], size2}); } void recompute_bias_and_weights(const Scope* scope, @@ -135,6 +64,13 @@ void recompute_bias_and_weights(const Scope* scope, const ir::Node& bn_variance, // LoDTensor* eltwise_y_in_tensor, // float epsilon) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + // Re-compute bias of conv2d from BN PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims()); @@ -143,31 +79,38 @@ void recompute_bias_and_weights(const Scope* scope, scope->FindVar(bn_variance.Name())->GetMutable(); auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable(); - auto std_tensor = LoDTensor(); - std_tensor.Resize(bn_bias_tensor.dims()); - std_tensor = - tensor_apply(*variance_tensor, [&](float x) { return x + epsilon; }); + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + EigenVectorArrayMap variance_array( + variance_tensor->mutable_data(platform::CPUPlace()), + variance_tensor->numel(), 1); + ConstEigenVectorArrayMap mean_array(mean_tensor->data(), + mean_tensor->numel(), 1); + ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data(), + bn_bias_tensor.numel(), 1); - using EigenVectorArrayMap = - Eigen::Map>; + // variance will not be used anymore, so make it std_array and then tmp_array + variance_array += epsilon; + variance_array = variance_array.sqrt(); + variance_array = scale_array / variance_array; + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); - EigenVectorArrayMap std_vec( - std_tensor.mutable_data(platform::CPUPlace()), std_tensor.numel(), - 1); - std_vec = std_vec.sqrt(); - auto tmp_tensor = - tensor_apply_eltwise(*scale_tensor, std_tensor, std::divides()); - auto tensor_minus = tensor_apply_eltwise(*eltwise_y_in_tensor, *mean_tensor, - std::minus()); - auto tensor_mul = - tensor_apply_eltwise(tensor_minus, tmp_tensor, std::multiplies()); - *eltwise_y_in_tensor = - tensor_apply_eltwise(tensor_mul, bn_bias_tensor, std::plus()); + eltwise_y_in_array = + ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array; // Re-compute weight of conv2d from BN - auto* current_param = - scope->FindVar(conv_weight->Name())->GetMutable(); - recompute_conv_weights(current_param, &tmp_tensor); + auto* weights = 
scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = make_dims_2d(weights_shape); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= variance_array; } std::unique_ptr ConvBNFusePass::ApplyImpl( From 8686f7c68e2e524a3d0cdd2f0c555851eafeb59a Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 13 Oct 2018 18:31:41 +0800 Subject: [PATCH 22/48] add reader_queue_speed_test_mode flag for speed test --- paddle/fluid/operators/reader/blocking_queue.h | 7 ++++++- .../fluid/operators/reader/reader_blocking_queue_test.cc | 4 ++++ paddle/fluid/platform/enforce.h | 7 +++++++ paddle/fluid/pybind/pybind.cc | 4 ++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 28cc91a5ed..3eefb2db51 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -14,11 +14,14 @@ #pragma once +#include #include // NOLINT #include #include "paddle/fluid/platform/enforce.h" +DECLARE_bool(reader_queue_speed_test_mode); + namespace paddle { namespace operators { namespace reader { @@ -72,7 +75,9 @@ class BlockingQueue { if (!queue_.empty()) { PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); - queue_.pop_front(); + if (LIKELY(!FLAGS_reader_queue_speed_test_mode)) { + queue_.pop_front(); + } send_cv_.notify_one(); return true; } else { diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 7d1b381d56..9b016469cf 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -20,6 +20,10 @@ #include "paddle/fluid/operators/reader/blocking_queue.h" +DEFINE_bool(reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); + using paddle::operators::reader::BlockingQueue; TEST(BlockingQueue, CapacityTest) { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f04395a8ac..a251bfcd99 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -130,6 +130,13 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) (condition == 0) #endif +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) +#else +// there is no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition != 0) +#endif + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 311cd94460..2b730f2bdc 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -57,6 +57,10 @@ limitations under the License. 
*/ #include "pybind11/stl.h" +DEFINE_bool(reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); + // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); From c61e16b181ff03d4bae1d7c203bec9f598b91e2c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 13 Oct 2018 19:18:27 +0800 Subject: [PATCH 23/48] add reader_queue_speed_test_mode_flag test --- .../reader/reader_blocking_queue_test.cc | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 9b016469cf..cfcac11228 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -221,3 +221,28 @@ TEST(BlockingQueue, MyClassTest) { q.Receive(&b); EXPECT_EQ(a.val_, b.val_); } + +TEST(BlockingQueue, reader_queue_speed_test_mode_flag) { + FLAGS_reader_queue_speed_test_mode = false; + size_t queue_size = 10; + BlockingQueue q(queue_size); + for (size_t i = 0; i < queue_size; ++i) { + q.Send(i); + } + size_t b; + for (size_t i = 0; i < queue_size; ++i) { + q.Receive(&b); + EXPECT_EQ(b, i); + } + EXPECT_EQ(q.Size(), 0); + + FLAGS_reader_queue_speed_test_mode = true; + for (size_t i = 0; i < queue_size; ++i) { + q.Send(i); + } + for (size_t i = 0; i < queue_size; ++i) { + q.Receive(&b); + EXPECT_EQ(b, 0); + } + EXPECT_EQ(q.Size(), queue_size); +} From d852be7c4886678d1fe12a6f37148cf7437d0a82 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sat, 13 Oct 2018 19:25:23 +0800 Subject: [PATCH 24/48] Revert "Make variable::GetMutable robust" --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 ++- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/variable.h | 6 +----- paddle/fluid/framework/variable_test.cc | 11 +++++------ python/paddle/fluid/tests/book/test_word2vec.py | 16 ++++++++++++++-- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index a070b8efb8..70ec6e90a4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf..8e1f93c5eb 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,7 +27,8 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. 
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = *(g_feed_value->GetMutable()); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 2840d503f1..ba10687d65 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a5..067e0c2b83 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,12 +38,8 @@ class Variable { template T* GetMutable() { - if (!holder_) { + if (!IsType()) { holder_.reset(new PlaceholderImpl(new T())); - } else { - PADDLE_ENFORCE(IsType(), - "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3df..c5c1d215f4 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,10 +33,9 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - try { - v->GetMutable(); - } catch (std::exception& e) { - return; - } - EXPECT_TRUE(false); + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); } diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 1f3a230048..9191f0fc20 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -83,7 +84,18 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - raise ValueError('is_parallel=True not implemented') + places = get_places() + pd = ParallelDo(places) + with pd.do(): + avg_cost, predict_word = __network__( + list( + map(pd.read_input, [ + first_word, second_word, third_word, forth_word, + next_word + ]))) + pd.write_output(avg_cost) + + avg_cost = fluid.layers.mean(pd()) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -250,7 +262,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, ): # TODO(paddle-dev): Add parallel test. 
+ for is_parallel in (False, True): inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__': From 2178ae56989d783380b2512a77b65e2a5eedb500 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 13 Oct 2018 19:38:59 +0800 Subject: [PATCH 25/48] add reader_queue_speed_test_mode to python init test=develop --- python/paddle/fluid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7bbdf7de89..41678918b8 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -113,7 +113,8 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb' + 'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb', + 'reader_queue_speed_test_mode' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') From 77e9339deb6a9245d9ba14f6aa7b670c34559879 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Sun, 14 Oct 2018 11:16:46 +0800 Subject: [PATCH 26/48] fix api get_pserver_programs (#13820) --- python/paddle/fluid/transpiler/distribute_transpiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index ecdbe27f4d..91db85b8ec 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -788,7 +788,8 @@ in a single call.") tuple: (main_program, startup_program), of type "Program" """ pserver_prog = self.get_pserver_program(endpoint) - pserver_startup = self.get_startup_program(endpoint) + pserver_startup = self.get_startup_program( + endpoint, pserver_program=pserver_prog) return pserver_prog, pserver_startup def get_startup_program(self, From b11372a0af9938ed0cd1bc5fdee585bfcd364911 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 15 Oct 2018 10:44:36 +0800 Subject: [PATCH 27/48] fix doc test=develop --- paddle/fluid/pybind/pybind.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a91894ba89..8a366f7d8f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -170,14 +170,14 @@ PYBIND11_PLUGIN(core) { A LoDTensor X can look like the example below. It contains 2 sequences. The first has length 2 and the second has length 3, as described by x.lod. - The first tensor dimension 6=2+3 is calculated from LoD if it's available. + The first tensor dimension 5=2+3 is calculated from LoD if it's available. It means the total number of sequence element. In X, each element has 2 - columns, hence [6, 2]. + columns, hence [5, 2]. x.lod = [[2, 3]] x.data = [[1, 2], [3, 4], - [5, 6], [7, 8], [9, 10], [11, 12]] - x.shape = [6, 2] + [5, 6], [7, 8], [9, 10]] + x.shape = [5, 2] LoD can have multiple levels (for example, a paragraph can have multiple sentences and a sentence can have multiple words). 
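As a concrete aside, a minimal sketch of building the two-sequence LoDTensor described above from Python — assuming the fluid 1.x helper `fluid.create_lod_tensor`, which takes a length-based LoD list:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    # Two sequences of lengths 2 and 3: the first dimension is 5 = 2 + 3,
    # and each element has 2 columns, matching x.shape = [5, 2] above.
    data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]],
                    dtype='float32')
    x = fluid.create_lod_tensor(data, [[2, 3]], fluid.CPUPlace())
    print(x.recursive_sequence_lengths())  # [[2, 3]]
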
In the following
From 2562eb92b895a6d7af769bfc33d10f29397dd8d7 Mon Sep 17 00:00:00 2001
From: chengduo 
Date: Mon, 15 Oct 2018 11:02:35 +0800
Subject: [PATCH 28/48] Add strategy doc (#13849)

* add strategy doc
test=develop

* fix doc
test=develop

* add ParallelExecutor arg doc
test=develop
---
 paddle/fluid/pybind/pybind.cc            | 123 ++++++++++++++++-------
 python/paddle/fluid/parallel_executor.py |  21 +++-
 2 files changed, 103 insertions(+), 41 deletions(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a91894ba89..8d58e018c3 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -667,16 +667,17 @@ All parameter, weight, gradient are variables in Paddle.
     ExecutionStrategy allows the user to more preciously control how to run
     the program in ParallelExecutor by setting the property.

-    The available properties include:
-    use_cuda (bool): Whether to use CUDA or not. Default True.
-    num_threads (int): The number of threads that used to run the
-        operators in ParallelExecutor. If it is not set, it will be
-        set in ParallelExecutor according to the device count.
-        Default 0.
-    allow_op_delay (bool): Whether to delay the communication operators
-        to run. Default False.
-    num_iteration_per_drop_scope (int): how many iterations between
-        the two dropping local scopes. Default 100.
+    Examples:
+        .. code-block:: python
+
+          exec_strategy = fluid.ExecutionStrategy()
+          exec_strategy.num_threads = 4
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                             loss_name=loss.name,
+                                             exec_strategy=exec_strategy)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)

         )DOC");

@@ -686,19 +687,34 @@ All parameter, weight, gradient are variables in Paddle.
           [](const ExecutionStrategy &self) { return self.num_threads_; },
           [](ExecutionStrategy &self, size_t num_threads) {
             self.num_threads_ = num_threads;
-          })
+          },
+          R"DOC(The type is INT, num_threads represents the size of the thread pool that
+          is used to run the operators of the current program in ParallelExecutor.
+          If :math:`num\_threads=1`, all the operators will execute one by one,
+          but the order may be different between iterations.
+          If it is not set, it will be set in ParallelExecutor according to the
+          device type and device count: for GPU, :math:`num\_threads=device\_count*4`; for CPU,
+          :math:`num\_threads=CPU\_NUM*4`. The explanation of :math:`CPU\_NUM` is in ParallelExecutor;
+          if :math:`CPU\_NUM` is not set, ParallelExecutor will get the CPU count by calling
+          `multiprocessing.cpu_count()`. Default 0.)DOC")
       .def_property(
           "use_cuda",
           [](const ExecutionStrategy &self) { return self.use_cuda_; },
           [](ExecutionStrategy &self, bool use_cuda) {
             self.use_cuda_ = use_cuda;
-          })
+          })  // FIXME(chengduo): No doc is added for 'use_cuda' because it may
+              // confuse users: ParallelExecutor also has a parameter named
+              // 'use_cuda', and in the current implementation, ParallelExecutor's
+              // 'use_cuda' overrides ExecutionStrategy's 'use_cuda'.
       .def_property(
           "allow_op_delay",
           [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
           [](ExecutionStrategy &self, bool allow_op_delay) {
             self.allow_op_delay_ = allow_op_delay;
-          })
+          },
+          R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
+          communication operators to run, which may make the execution faster.
+          Note that in some models, allow_op_delay may cause program hang. Default False.)DOC")
       .def_property(
           "num_iteration_per_drop_scope",
           [](const ExecutionStrategy &self) {
@@ -706,7 +722,19 @@ All parameter, weight, gradient are variables in Paddle.
},
           [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
             self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
-          });
+          },
+          R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
+          many iterations to wait before cleaning up the temp variables which
+          are generated during execution. It may make the execution faster,
+          because the temp variables' shapes may be the same between two iterations. Default 100.
+
+          NOTES:
+              1. If you fetch data when calling 'run', ParallelExecutor
+              will clean up the temp variables at the end of the current iteration.
+              2. In some NLP models, it may cause GPU memory to be insufficient;
+              in this case, you should reduce `num_iteration_per_drop_scope`.
+              )DOC");
+
   exec_strategy.def_property(
       "use_experimental_executor",
       [](const ExecutionStrategy &self) {
@@ -721,20 +749,17 @@ All parameter, weight, gradient are variables in Paddle.
     BuildStrategy allows the user to more preciously control how to
    build the SSA Graph in ParallelExecutor by setting the property.

-    The available properties include:
-    reduce_strategy (str): There are two reduce strategies, 'AllReduce'
-        and 'Reduce'. If you want that all parameters will be optimized
-        on all devices, you can choose 'AllReduce'; if you choose
-        'Reduce', all parameters will be evenly allocated to different
-        devices for optimization, and then broadcast the optimized
-        parameter to other devices. Default 'AllReduce'.
-    gradient_scale_strategy (str): There are two ways of defining loss@grad,
-        'CoeffNumDevice' and 'Customized'. By default, ParallelExecutor
-        sets the loss@grad according to the number of devices. If you want
-        to customize loss@grad, you can choose 'Customized'.
-        Default 'CoeffNumDevice'.
-    debug_graphviz_path (str): Whether to write the SSA Graph to file in the
-        form of graphviz. It is useful for debugging. Default "".
+    Examples:
+        .. code-block:: python
+
+          build_strategy = fluid.BuildStrategy()
+          build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                             loss_name=loss.name,
+                                             build_strategy=build_strategy)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
 )DOC");

   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
@@ -753,31 +778,51 @@ All parameter, weight, gradient are variables in Paddle.
           [](const BuildStrategy &self) { return self.reduce_; },
           [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
             self.reduce_ = strategy;
-          })
+          },
+          R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
+          'AllReduce' and 'Reduce'. If you want the optimization of all the parameters
+          to be done on all devices independently, you should choose 'AllReduce';
+          if you choose 'Reduce', the optimization of the parameters will be evenly distributed
+          across different devices, and the optimized parameters will then be broadcast to the other devices.
+          In some models, `Reduce` is faster. Default 'AllReduce'. )DOC")
       .def_property(
           "gradient_scale_strategy",
           [](const BuildStrategy &self) { return self.gradient_scale_; },
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
             self.gradient_scale_ = strategy;
-          })
+          },
+          R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
+          ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default,
+          ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
+          If you want to customize :math:`loss@grad`, you can choose 'Customized'.
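To make the gradient-scale options above concrete, a minimal sketch of selecting a non-default strategy (`loss` and the rest of the program are assumed to be defined elsewhere, as in the Examples blocks this patch adds):

.. code-block:: python

    build_strategy = fluid.BuildStrategy()
    # Keep loss@grad at 1.0 instead of 1/device_count; 'Customized'
    # would additionally require feeding the loss gradient yourself.
    build_strategy.gradient_scale_strategy = \
        fluid.BuildStrategy.GradientScaleStrategy.One

    train_exe = fluid.ParallelExecutor(use_cuda=True,
                                       loss_name=loss.name,
                                       build_strategy=build_strategy)
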
+          Default 'CoeffNumDevice'.)DOC")
       .def_property(
           "debug_graphviz_path",
           [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
           [](BuildStrategy &self, const std::string &path) {
             self.debug_graphviz_path_ = path;
-          })
+          },
+          R"DOC(The type is STR, debug_graphviz_path indicates the path to which
+          the SSA Graph will be written in graphviz format.
+          It is useful for debugging. Default "")DOC")
       .def_property(
           "enable_data_balance",
           [](const BuildStrategy &self) { return self.enable_data_balance_; },
-          [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; })
-      .def_property("fuse_elewise_add_act_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_elewise_add_act_ops_;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      self.fuse_elewise_add_act_ops_ = b;
-                    })
+          [](BuildStrategy &self, bool b) {
+            self.enable_data_balance_ = b;
+          })  // FIXME(chengudo): enable_data_balance seems not important
+      .def_property(
+          "fuse_elewise_add_act_ops",
+          [](const BuildStrategy &self) {
+            return self.fuse_elewise_add_act_ops_;
+          },
+          [](BuildStrategy &self, bool b) {
+            self.fuse_elewise_add_act_ops_ = b;
+          },
+          R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicates whether
+          to fuse elementwise_add_op and activation_op,
+          which may make the execution faster. Default False)DOC")
       .def("_create_passes_from_strategy",
            [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
              return self.CreatePassesFromStrategy();
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 57d272cbfb..3f4dd5eb71 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -31,15 +31,32 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy

 class ParallelExecutor(object):
     """
-    ParallelExecutor can run program in parallel.
+    ParallelExecutor is designed for data parallelism, which focuses on distributing
+    the data across different nodes, with every node operating on the data in parallel.
+    If you use ParallelExecutor to run the current program on GPU, the node means a GPU
+    device, and ParallelExecutor will get the available GPU devices automatically on
+    the current machine. If you use ParallelExecutor to run the current program on CPU,
+    the node means a CPU device, and you can specify the CPU device number by setting
+    the 'CPU_NUM' environment variable, for example 'CPU_NUM=4'; if the environment variable
+    is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
+    of CPUs in the system.

     Args:
         use_cuda (bool): Whether to use CUDA or not.
         loss_name (str): The loss name must set in training. Default None.
         main_program (Program): The program that need to run, if not provided,
            then default_main_program will be used. Default None.
-        share_vars_from(ParallelExecutor): If provied, it will share variables
+        share_vars_from(ParallelExecutor): If provided, it will share variables
            from the specified ParallelExecutor. Default None.
+        exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
+            the program in ParallelExecutor, for example how many threads are used to
+            execute the program, and how many iterations to wait before cleaning up the temp
+            variables which are generated during execution. For more information, please refer
+            to fluid.ExecutionStrategy. Default None.
+        build_strategy(BuildStrategy): build_strategy is used to control how to
+            build the SSA Graph in ParallelExecutor by setting the property,
+            for example reduce_strategy, gradient_scale_strategy.
For more information, + please refer to fluid.BuildStrategy. Default None. num_trainers(int): If greater than 1, NCCL will be initialized with multiple rank of nodes, each node should have same number of GPUs. Distributed training will be enabled then. Default 1. From 84d9300365fbfda987d5beff7b868e4828454580 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 11 Oct 2018 14:10:16 +0000 Subject: [PATCH 29/48] test=develop --- paddle/fluid/operators/rmsprop_op.h | 32 +-- .../fluid/tests/unittests/test_rmsprop_op.py | 226 +++++++++++------- 2 files changed, 151 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index 406730407d..797cd45fdc 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -131,21 +131,21 @@ template class RmspropOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::LoDTensor; + using LoDTensor = framework::LoDTensor; auto *grad_var = ctx.InputVar("Grad"); - auto *param_out = ctx.Output("ParamOut"); - auto *moment_out = ctx.Output("MomentOut"); - auto *mean_square_out = ctx.Output("MeanSquareOut"); + auto *param_out = ctx.Output("ParamOut"); + auto *moment_out = ctx.Output("MomentOut"); + auto *mean_square_out = ctx.Output("MeanSquareOut"); auto epsilon = static_cast(ctx.Attr("epsilon")); auto rho = static_cast(ctx.Attr("decay")); auto momentum = static_cast(ctx.Attr("momentum")); bool centered = ctx.Attr("centered"); - auto &p_tensor = *ctx.Input("Param"); - auto &ms_tensor = *ctx.Input("MeanSquare"); - auto &lr_tensor = *ctx.Input("LearningRate"); - auto &mom_tensor = *ctx.Input("Moment"); + auto &p_tensor = *ctx.Input("Param"); + auto &ms_tensor = *ctx.Input("MeanSquare"); + auto &lr_tensor = *ctx.Input("LearningRate"); + auto &mom_tensor = *ctx.Input("Moment"); PADDLE_ENFORCE_EQ(&p_tensor, param_out, "Param and ParamOut must be the same Tensor"); @@ -157,8 +157,8 @@ class RmspropOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); size_t limit = static_cast(ms_tensor.numel()); - if (grad_var->IsType()) { - auto &grad_tensor = grad_var->Get(); + if (grad_var->IsType()) { + auto &grad_tensor = grad_var->Get(); if (std::is_same::value) { auto &place = @@ -176,9 +176,9 @@ class RmspropOpKernel : public framework::OpKernel { ms_out.device(place) = rho * ms + (1 - rho) * g * g; if (centered) { - auto &mg_tensor = *ctx.Input("MeanGrad"); + auto &mg_tensor = *ctx.Input("MeanGrad"); auto mg = EigenVector::Flatten(mg_tensor); - auto *mean_grad_out = ctx.Output("MeanGradOut"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); PADDLE_ENFORCE(&mg_tensor, mean_grad_out, "MeanGrad and MeanGradOut must be the same Tensor"); auto mg_out = EigenVector::Flatten(*mean_grad_out); @@ -196,8 +196,8 @@ class RmspropOpKernel : public framework::OpKernel { DenseRmspropGradFunctor grad_func(grad_tensor.data()); platform::ForRange for_range(dev_ctx, limit); if (centered) { - auto &mg_tensor = *ctx.Input("MeanGrad"); - auto *mean_grad_out = ctx.Output("MeanGradOut"); + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); PADDLE_ENFORCE(&mg_tensor, mean_grad_out, "MeanGrad and MeanGradOut must be the same Tensor"); for_range(CenteredRmspropFunctor>( @@ -241,8 +241,8 @@ class RmspropOpKernel : public framework::OpKernel { row_numel, row_count); if (centered) { - auto &mg_tensor = *ctx.Input("MeanGrad"); - auto *mean_grad_out = 
ctx.Output("MeanGradOut"); + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); PADDLE_ENFORCE(&mg_tensor, mean_grad_out, "MeanGrad and MeanGradOut must be the same Tensor"); for_range(CenteredRmspropFunctor>( diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 70848e4e22..335d595f3d 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -19,29 +19,72 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator +import paddle.fluid as fluid + + +def create_selected_rows_and_tensor(scope, place, height, row_num, + embedding_size): + sr = scope.var("@selected_rows@").get_selected_rows() + tensor = scope.var("grad").get_tensor() + + rows = np.random.random_integers( + low=0, high=height - 1, size=[row_num, ]).astype('int64') + sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32') + + sr.set_height(height) + sr.set_rows(rows) + sr.get_tensor().set(sr_val, place) + + tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32') + for i in range(row_num): + row = rows[i] + tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :] + + tensor.set(tensor_val, place) + return tensor_val, sr_val class TestBase(unittest.TestCase): - def setup(self, centered, epsilon=1e-6): + def setup(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): np.random.seed(5) # fix seed + self.scope = fluid.global_scope() + self.place = place + self.param_name = "param" - self.param = np.random.random((123, 321)).astype("float32") + self.param = np.random.random(size).astype("float32") self.mean_square_name = "mean_square" - self.mean_square = np.random.random((123, 321)).astype("float32") + self.mean_square = np.random.uniform( + low=1, high=2, size=size).astype("float32") self.mean_grad_name = "mean_grad" - self.mean_grad = np.random.random((123, 321)).astype("float32") + self.mean_grad = np.random.random(size).astype("float32") self.lr_name = "lr" self.learning_rate = np.array([0.01]).astype("float32") self.grad_name = "grad" - self.grad = np.random.random((123, 321)).astype("float32") + + self.is_sparse = is_sparse + if self.is_sparse: + self.grad_sr_name = "@selected_rows@" + self.grad, self.grad_sr = create_selected_rows_and_tensor( + self.scope, place, size[0], row_num, size[1]) + else: + self.grad = np.random.random(size).astype("float32") + grad_tensor = self.scope.var(self.grad_name).get_tensor() + grad_tensor.set(self.grad, place) self.moment_name = "moment" - self.moment = np.zeros((123, 321)).astype("float32") + self.moment = np.random.uniform( + low=0, high=1, size=size).astype("float32") self.epsilon = epsilon self.decay = 0.9 @@ -61,118 +104,119 @@ class TestBase(unittest.TestCase): self.param_out = self.param - self.moment_out - def check(self, - actual_t, - expect_t, - place, - out_name, - atol=1e-5, - equal_nan=False): - self.assertTrue( - np.allclose( - actual_t, expect_t, atol=atol, equal_nan=equal_nan), - "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " - + str(expect_t) + "\n" + "But Got" + str(actual_t)) - - -class TestRmspropOp(TestBase): - def check_with_place(self, place, centered, epsilon): - self.setup(centered, epsilon) - scope = core.Scope() - # create and initialize Param Variable - param = scope.var(self.param_name).get_tensor() - param.set(self.param, place) + 
self.param_tensor = self.scope.var(self.param_name).get_tensor() + self.param_tensor.set(self.param, place) - mean_square = scope.var(self.mean_square_name).get_tensor() - mean_square.set(self.mean_square, place) + self.mean_square_tensor = self.scope.var( + self.mean_square_name).get_tensor() + self.mean_square_tensor.set(self.mean_square, place) - lr = scope.var(self.lr_name).get_tensor() + lr = self.scope.var(self.lr_name).get_tensor() lr.set(self.learning_rate, place) - grad = scope.var(self.grad_name).get_tensor() - grad.set(self.grad, place) + self.moment_tensor = self.scope.var(self.moment_name).get_tensor() + self.moment_tensor.set(self.moment, place) - moment = scope.var(self.moment_name).get_tensor() - moment.set(self.moment, place) + if self.centered: + self.mean_grad_tensor = self.scope.var( + self.mean_grad_name).get_tensor() + self.mean_grad_tensor.set(self.mean_grad, place) - # create and run sgd operator + def check(self, actual_t, expect_t, place, out_name, atol=1e-5): + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + + str(expect_t) + "\n" + "But Got" + str(actual_t)) - if self.centered: - mean_grad = scope.var(self.mean_grad_name).get_tensor() - mean_grad.set(self.mean_grad, place) - - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - MeanGrad=self.mean_grad_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - MeanGradOut=self.mean_grad_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=True) - else: - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=False) - - rmsprop_op.run(scope, place) - - atol = 1e-5 - equal_nan = False + +class TestRmspropOp(TestBase): + def check_with_place(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): + self.setup(place, is_sparse, centered, size, row_num, epsilon) + self.run_and_check() + + def run_and_check(self): + grad_name = self.grad_sr_name if self.is_sparse else self.grad_name + + kwargs = { + 'Param': self.param_name, + 'Grad': grad_name, + 'MeanSquare': self.mean_square_name, + 'Moment': self.moment_name, + 'LearningRate': self.lr_name, + 'ParamOut': self.param_name, + 'MeanSquareOut': self.mean_square_name, + 'MomentOut': self.moment_name, + 'epsilon': self.epsilon, + 'decay': self.decay, + 'momentum': self.momentum, + 'centered': self.centered + } if self.centered: - atol = 1e-3 - equal_nan = True + kwargs['MeanGrad'] = self.mean_grad_name + kwargs['MeanGradOut'] = self.mean_grad_name + + rmsprop_op = Operator('rmsprop', **kwargs) + atol = 1e-6 + + rmsprop_op.run(self.scope, self.place) self.check( - np.array(mean_square), self.ms_out, place, self.mean_square_name) + np.array(self.mean_square_tensor), self.ms_out, self.place, + self.mean_square_name) self.check( - np.array(moment), + np.array(self.moment_tensor), self.moment_out, - place, + self.place, self.moment_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) self.check( - np.array(param), + np.array(self.param_tensor), self.param_out, - 
place, + self.place, self.param_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) if self.centered: self.check( - np.array(mean_grad), self.mg_out, place, self.mean_grad_name) + np.array(self.mean_grad_tensor), self.mg_out, self.place, + self.mean_grad_name) def test_rmsprop(self): places = [core.CPUPlace()] if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) + + size = (128, 320) for place in places: - self.check_with_place(place, False, 1e-6) - self.check_with_place(place, False, 1e-10) - self.check_with_place(place, True, 1e-6) - self.check_with_place(place, True, 1e-10) + for centered in [False, True]: + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, is_sparse=False, centered=centered, size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=512, + size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=60, + size=size) if __name__ == "__main__": From 3d976f3f18403c2f066bd361812f67e338257ada Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 15 Oct 2018 11:57:23 +0800 Subject: [PATCH 30/48] rename inference_lib_dist to fluid_lib_dist test=develop --- cmake/inference_lib.cmake | 5 +++-- paddle/fluid/train/demo/README.md | 2 +- paddle/scripts/paddle_build.sh | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 077072f6ea..a3e682e54a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -18,7 +18,7 @@ function(copy TARGET) set(oneValueArgs "") set(multiValueArgs SRCS DSTS DEPS) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE) + set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) @@ -185,7 +185,8 @@ copy(cmake_cache SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt DSTS ${FLUID_INSTALL_DIR}) -add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) +# This command generates a complete fluid library for both train and inference +add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) # paddle fluid version execute_process( diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md index 41b01d3382..191da20669 100644 --- a/paddle/fluid/train/demo/README.md +++ b/paddle/fluid/train/demo/README.md @@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ -DWITH_MKL=OFF \ -DWITH_MKLDNN=OFF make -j8 -make -j8 inference_lib_dist +make -j8 fluid_lib_dist ``` ### step 2. 
generate program desc
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index b97e63ecc8..da6f5ca158 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -648,25 +648,25 @@ function gen_capi_package() {
   fi
 }

-function gen_fluid_inference_lib() {
+function gen_fluid_lib() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
      cat < Date: Mon, 15 Oct 2018 02:45:19 +0000
Subject: [PATCH 31/48] Wrapper py api for sequence_unpad test=develop
---
 paddle/fluid/API.spec                          |  1 +
 python/paddle/fluid/layers/nn.py               | 59 +++++++++++++++++++
 .../fluid/tests/unittests/test_layers.py       |  8 +++
 3 files changed, 68 insertions(+)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index c6dd919a93..f19e4e3827 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -76,6 +76,7 @@ paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'outp
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 8c0ef7a824..1cd9a61ff9 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -56,6 +56,7 @@ __all__ = [
     'sequence_expand',
     'sequence_expand_as',
     'sequence_pad',
+    'sequence_unpad',
     'lstm_unit',
     'reduce_sum',
     'reduce_mean',
@@ -2843,6 +2844,64 @@ def sequence_pad(x, pad_value, maxlen=None):
     return out, length


+def sequence_unpad(x, length):
+    """
+    Sequence Unpad Layer
+
+    This layer removes the padding data in the input sequences and converts
+    them into sequences with their actual length as output, identified by LoD
+    information.
+
+    .. code-block:: text
+
+        Example:
+
+        Given input Variable **x**:
+            x.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
+                      [ 6.0,  7.0,  8.0,  9.0, 10.0],
+                      [11.0, 12.0, 13.0, 14.0, 15.0]],
+
+        in which there are 3 sequences padded to length 5, and the actual length
+        specified by input Variable **length**:
+
+            length.data = [[2], [3], [4]],
+
+        after unpadding, the output Variable will be:
+
+            out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
+            out.lod = [[0, 2, 5, 9]]
+
+    Args:
+        x(Variable): Input Variable which contains the padded sequences with
+            equal length.
+        length(Variable): The Variable that specifies the actual length of
+            sequences after unpadding.
+
+    Returns:
+        Variable: The Variable that contains the unpadded sequences.
+
+    Examples:
+        ..
code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32') + len = fluid.layers.data(name='length', shape=[1], dtype='int64') + out = fluid.layers.sequence_unpad(x=x, length=len) + """ + + helper = LayerHelper('sequence_unpad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + + length.stop_gradient = True + + helper.append_op( + type='sequence_unpad', + inputs={'X': x, + 'Length': length}, + outputs={'Out': out}) + return out + + def beam_search(pre_ids, pre_scores, ids, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1d8d0b55f0..91502514a6 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -194,6 +194,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1)) print(str(program)) + def test_sequence_unpad(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[10, 5], dtype='float32') + length = layers.data(name='length', shape=[1], dtype='int64') + self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) + print(str(program)) + def test_lstm_unit(self): program = Program() with program_guard(program): From 8e2fdc54b11e5471cbbee41f826abe3f9e75894f Mon Sep 17 00:00:00 2001 From: chengduo Date: Mon, 15 Oct 2018 14:36:37 +0800 Subject: [PATCH 32/48] Add check for opt op (#13840) * add check for opt op * fix opt op test=develop * fix test fail test=develop * fix optimization doc test=develop * test=develop --- paddle/fluid/operators/adadelta_op.cc | 12 +++++++ paddle/fluid/operators/adadelta_op.h | 11 +++++++ paddle/fluid/operators/adagrad_op.h | 33 ++++++++++++-------- paddle/fluid/operators/adam_op.h | 6 ++++ paddle/fluid/operators/adamax_op.cc | 10 ++++++ paddle/fluid/operators/adamax_op.h | 11 +++++++ paddle/fluid/operators/decayed_adagrad_op.cc | 10 ++++++ paddle/fluid/operators/decayed_adagrad_op.h | 11 +++++++ paddle/fluid/operators/ftrl_op.cc | 10 ++++++ paddle/fluid/operators/ftrl_op.h | 11 +++++++ paddle/fluid/operators/momentum_op.cc | 5 +++ paddle/fluid/operators/momentum_op.cu | 11 +++++++ paddle/fluid/operators/momentum_op.h | 6 ++++ paddle/fluid/operators/rmsprop_op.cc | 5 +++ paddle/fluid/operators/rmsprop_op.h | 6 ++++ paddle/fluid/operators/sgd_op.cc | 29 +++++++++-------- paddle/fluid/operators/sgd_op.cu | 6 ++++ python/paddle/fluid/optimizer.py | 12 +++++++ 18 files changed, 179 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc index d1970515f5..89a7a49e0f 100644 --- a/paddle/fluid/operators/adadelta_op.cc +++ b/paddle/fluid/operators/adadelta_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; + class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel { "Input(AvgSquaredGrad) of AdadeltaOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + 
ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of AdadeltaOp should not be null."); @@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel { ctx->SetOutputDim("AvgSquaredGradOut", param_dim); ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = diff --git a/paddle/fluid/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h index 822458daf6..6c616aa03d 100644 --- a/paddle/fluid/operators/adadelta_op.h +++ b/paddle/fluid/operators/adadelta_op.h @@ -23,6 +23,17 @@ template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = ctx.Output("AvgSquaredGradOut"); diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h index df520fcc89..0a16ce00f7 100644 --- a/paddle/fluid/operators/adagrad_op.h +++ b/paddle/fluid/operators/adagrad_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -21,25 +22,31 @@ namespace operators { template struct SparseAdagradFunctor { - void operator()(const DeviceContext& context, - const framework::SelectedRows& grad, - const framework::Tensor& learning_rate, T epsilon, - framework::Tensor* moment, framework::Tensor* param); + void operator()(const DeviceContext &context, + const framework::SelectedRows &grad, + const framework::Tensor &learning_rate, T epsilon, + framework::Tensor *moment, framework::Tensor *param); }; template class AdagradOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out_tensor = ctx.Output("ParamOut"); - auto* moment_out_tensor = ctx.Output("MomentOut"); + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + + auto *param_out_tensor = ctx.Output("ParamOut"); + auto *moment_out_tensor = ctx.Output("MomentOut"); param_out_tensor->mutable_data(ctx.GetPlace()); moment_out_tensor->mutable_data(ctx.GetPlace()); T epsilon = static_cast(ctx.Attr("epsilon")); - auto* grad_var = ctx.InputVar("Grad"); + auto *grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { auto param = framework::EigenVector::Flatten( *ctx.Input("Param")); @@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel { *ctx.Input("Grad")); auto moment = framework::EigenVector::Flatten( *ctx.Input("Moment")); - auto* learning_rate = ctx.Input("LearningRate"); + auto *learning_rate = ctx.Input("LearningRate"); auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto* place = ctx.template device_context().eigen_device(); + auto *place = ctx.template device_context().eigen_device(); moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); if (platform::is_cpu_place(ctx.GetPlace())) { - auto* lr = learning_rate->data(); + auto *lr = learning_rate->data(); param_out.device(*place) = param - lr[0] * grad / (moment_out.sqrt() + epsilon); } else { @@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel { lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } } else if (grad_var->IsType()) { - auto* param_tensor = ctx.Input("Param"); + auto *param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); - auto* moment_tensor = ctx.Input("Moment"); + auto *moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); SparseAdagradFunctor functor; diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 4cb1f3a80e..3e724f52e4 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -244,6 +244,12 @@ template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git 
a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc index 32062574bc..d4aa4d338a 100644 --- a/paddle/fluid/operators/adamax_op.cc +++ b/paddle/fluid/operators/adamax_op.cc @@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel { "Input(LearningRate) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), "Input(Beta1Pow) of AdamaxOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of AdamaxOp should not be null."); diff --git a/paddle/fluid/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h index de644676fd..7137fbd965 100644 --- a/paddle/fluid/operators/adamax_op.h +++ b/paddle/fluid/operators/adamax_op.h @@ -23,6 +23,17 @@ template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); auto inf_norm_out_tensor = ctx.Output("InfNormOut"); diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc index c0f2b49a04..d73ae9e272 100644 --- a/paddle/fluid/operators/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/decayed_adagrad_op.cc @@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasInput("LearningRate"), "Input(LearningRate) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of DecayedAdagradOp should not be null."); diff --git a/paddle/fluid/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h index a46af078e0..5df43d33ef 100644 --- a/paddle/fluid/operators/decayed_adagrad_op.h +++ b/paddle/fluid/operators/decayed_adagrad_op.h @@ -23,6 +23,17 @@ template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + 
ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc index 70ba25c213..b77e12d650 100644 --- a/paddle/fluid/operators/ftrl_op.cc +++ b/paddle/fluid/operators/ftrl_op.cc @@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel { "Input(Grad) of FTRL should not be null."); PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(LearningRate) of FTRL should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of FTRL should not be null."); diff --git a/paddle/fluid/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h index 6f821e7e99..8f812c9a03 100644 --- a/paddle/fluid/operators/ftrl_op.h +++ b/paddle/fluid/operators/ftrl_op.h @@ -28,6 +28,17 @@ template class FTRLOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); auto* lin_accum_out = ctx.Output("LinearAccumOut"); diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index 5f43c58108..c8079a99fb 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel { "Input(velocity) of Momentum should not be null."); PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(LearningRate) of Momentum should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of Momentum should not be null."); diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index a3932db1f3..5dc920c709 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -46,6 +46,17 @@ template class MomentumOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The 
Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 264726040f..40073d21b7 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -23,6 +23,12 @@ template class MomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc index 2f773f222e..f06f87e61d 100644 --- a/paddle/fluid/operators/rmsprop_op.cc +++ b/paddle/fluid/operators/rmsprop_op.cc @@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel { "Input(Grad) of RmspropOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Moment"), "Input(Moment) of RmspropOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(param_out) of RmspropOp should not be null."); diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index 25ed32c5eb..a04d1bd2ca 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -28,6 +28,12 @@ template class RmspropOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + auto* param_out = ctx.Output("ParamOut"); auto* moment_out = ctx.Output("MomentOut"); auto* mean_square_out = ctx.Output("MeanSquareOut"); diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index fef230e42d..411a126bc8 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Param"), "Input(Param) of SGDOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Grad"), @@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { auto data_type = 
framework::GetDataTypeOfVar(ctx.InputVar("Param")); return framework::OpKernelType(data_type, ctx.device_context()); } @@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto input_var = op_desc.Input("Param")[0]; - for (auto& out_var : op_desc.Output("ParamOut")) { - if (block->FindRecursiveOrCreateVar(input_var).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::SELECTED_ROWS); - } else { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::LOD_TENSOR); + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto input_var_n = op_desc.Input("Param")[0]; + auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType(); + PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS || + in_var_type == framework::proto::VarType::LOD_TENSOR, + "The input Var's type should be LoDtensor or SelectedRows," + " but the received var(%s)'s type is %s", + input_var_n, in_var_type); + + for (auto &out_var_n : op_desc.Output("ParamOut")) { + auto &out_var = block->FindRecursiveOrCreateVar(out_var_n); + if (out_var.GetType() != in_var_type) { + out_var.SetType(in_var_type); } } } diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 2436090757..d3f4eba3b2 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -56,6 +56,12 @@ template class SGDOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); auto* learning_rate = ctx.Input("LearningRate"); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1b9571f6d3..ed1784bd27 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer): optimizer = fluid.optimizer.Adamax(learning_rate=0.2) optimizer.minimize(cost) + + Notes: + Currently, AdamaxOptimizer doesn't support sparse parameter optimization. """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" @@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer): optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) optimizer.minimize(cost) + + Notes: + Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization. """ _moment_acc_str = "moment" @@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer): optimizer = fluid.optimizer.Adadelta( learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) _, params_grads = optimizer.minimize(cost) + + Notes: + Currently, AdadeltaOptimizer doesn't support sparse parameter optimization. """ _avg_squared_grad_acc_str = "_avg_squared_grad" @@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer): optimizer = fluid.optimizer.Ftrl(0.0001) _, params_grads = optimizer.minimize(cost) + + Notes: + Currently, FtrlOptimizer doesn't support sparse parameter optimization. 
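A hypothetical sketch of the failure mode these notes (and the PADDLE_ENFORCE checks added above) guard against — the toy network here is invented for illustration:

.. code-block:: python

    import paddle.fluid as fluid

    ids = fluid.layers.data(name='ids', shape=[1], dtype='int64')
    # is_sparse=True makes the embedding's gradient a SelectedRows;
    # the Ftrl kernel only accepts LoDTensor gradients, so running
    # this program would trip the type checks introduced in this patch.
    emb = fluid.layers.embedding(input=ids, size=[100, 8], is_sparse=True)
    loss = fluid.layers.mean(emb)
    fluid.optimizer.Ftrl(learning_rate=0.0001).minimize(loss)
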
""" _squared_acc_str = "squared" From fb6201e93ec6ffbc19885621896c46b5901104d7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 12 Oct 2018 08:28:32 +0000 Subject: [PATCH 33/48] test=develop --- .../fluid/framework/details/op_handle_base.h | 3 +- paddle/fluid/framework/executor.cc | 84 ++++++++++--------- paddle/fluid/framework/executor.h | 44 +++++----- paddle/fluid/framework/parallel_executor.cc | 32 ++++++- paddle/fluid/framework/parallel_executor.h | 15 +++- paddle/fluid/framework/scope.cc | 29 ++++--- paddle/fluid/framework/scope.h | 5 ++ 7 files changed, 129 insertions(+), 83 deletions(-) diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9fbefabc84..d09b94a3fd 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -64,7 +64,8 @@ class OpHandleBase { virtual bool IsMultiDeviceTransfer() { return false; } const platform::DeviceContext *DeviceContext(platform::Place place) { - return dev_ctxes_[place]; + auto it = dev_ctxes_.find(place); + return it != dev_ctxes_.end() ? it->second : nullptr; } void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4..4576999c8e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } +template +static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, + GarbageCollector* gc, + RefCntMap* ref_cnts) { + std::unordered_set erase_tensors; + + auto handler = [&](const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto it = ref_cnts->find(name); + if (it == ref_cnts->end()) continue; + if ((it->second)-- == 1) { + auto* var = scope.FindVar(name); + if (var != nullptr) { + VLOG(10) << "Erase tensor \'" << name << "\'"; + if (var->IsType()) { + erase_tensors.insert(var->GetMutable()); + } else if (var->IsType()) { + erase_tensors.insert( + var->GetMutable()->mutable_value()); + } + } + } + } + } + }; + + handler(op->Inputs()); + handler(op->Outputs()); + + if (!erase_tensors.empty()) { + gc->Add(erase_tensors); + } +} + Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr> gc; - if (max_memory_size >= 0) { + // WhileOp would set keep_kids to false + // WhileGradOp would need the scopes created in WhileOp + // Perhaps, we should not perform eager deletion in WhileOp + // The scopes and variables created by WhileOp would be deleted + // in WhileGradOp. 
+ if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, op->Run(*local_scope, place_); if (gc != nullptr) { - std::vector erase_vars; - for (auto& input : op->Inputs()) { - for (auto& input_name : input.second) { - auto it = ctx->cur_ref_cnts_.find(input_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { // should delete it - erase_vars.emplace_back(input_name); - ctx->cur_ref_cnts_.erase(input_name); - } else { - --(it->second); - } - } - } - - for (auto& output : op->Outputs()) { - for (auto& output_name : output.second) { - auto it = ctx->cur_ref_cnts_.find(output_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { - erase_vars.emplace_back(output_name); - ctx->cur_ref_cnts_.erase(output_name); - } else { - --(it->second); - } - } - } - - if (!erase_vars.empty()) { - std::vector erase_tensors; - for (auto& name : erase_vars) { - auto* var = local_scope->FindVar(name); - if (var == nullptr) continue; - if (var->IsType()) { - auto* tensor = var->GetMutable(); - erase_tensors.push_back(tensor); - } - } - if (!erase_tensors.empty()) gc->Add(erase_tensors); - } + DeleteUnusedTensors(*local_scope, op.get(), gc.get(), + &(ctx->cur_ref_cnts_)); } if (FLAGS_benchmark) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index f0cc1338a8..36b36d49c2 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -32,38 +32,32 @@ template std::unordered_map GetNonPersistableReferenceCount( const ProgramDesc& prog, size_t block_id) { auto& block = prog.Block(block_id); - std::unordered_set ignored_vars; std::unordered_map ref_cnts; - for (auto var_desc : block.AllVars()) { - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { - ignored_vars.insert(var_desc->Name()); // ignore persistable vars - } - } - - for (auto op_desc : block.AllOps()) { - for (auto& input : op_desc->Inputs()) { - for (auto& input_name : input.second) { - if (!ignored_vars.count(input_name)) { - if (ref_cnts.count(input_name)) - ++ref_cnts[input_name]; - else - ref_cnts[input_name] = 1; + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS) { + continue; } - } - } - for (auto& output : op_desc->Outputs()) { - for (auto output_name : output.second) { - if (!ignored_vars.count(output_name)) { - if (ref_cnts.count(output_name)) - ++ref_cnts[output_name]; - else - ref_cnts[output_name] = 1; + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; } } } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); } return ref_cnts; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f06bad6c78..8d2e66009c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -64,6 +64,8 @@ 
ParallelExecutor::ParallelExecutor( const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, size_t num_trainers, size_t trainer_id) : member_(new ParallelExecutorPrivate(places)) { + is_alive_.test_and_set(); + member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_all_reduce_ = @@ -246,6 +248,15 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { + // If ParallelExecutor has been destructed + // just return + if (!is_alive_.test_and_set()) return; + + // If ParallelExecutor is running + if (is_running_.test_and_set()) { + PADDLE_THROW("The previous ParallelExecutor::Run() has not stopped"); + } + platform::RecordBlock b(0); #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { @@ -259,9 +270,17 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } } #endif - auto fetch_data = member_->executor_->Run(fetch_tensors); - *member_->global_scope_->Var(fetched_var_name)->GetMutable() = - fetch_data; + try { + auto fetch_data = member_->executor_->Run(fetch_tensors); + *member_->global_scope_->Var(fetched_var_name) + ->GetMutable() = fetch_data; + is_running_.clear(); + } catch (...) { + is_running_.clear(); + if (is_alive_.test_and_set()) { + std::rethrow_exception(std::current_exception()); + } + } } void ParallelExecutor::FeedTensorsIntoLocalScopes( @@ -299,6 +318,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } ParallelExecutor::~ParallelExecutor() { + is_alive_.clear(); if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { Scope *local_scope = member_->local_scopes_[i]; @@ -307,6 +327,12 @@ ParallelExecutor::~ParallelExecutor() { } } } + + while (is_running_.test_and_set()) { + // wait unitl all threads have been stopped + } + + member_.reset(); } } // namespace framework diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index fd386a5987..b78f717375 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -75,7 +75,20 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - ParallelExecutorPrivate *member_; + std::unique_ptr member_; + + // FIXME(zjl): HOT-FIX + // A flag to indicate whether ParallelExecutor is destructed. + // In Python side, when users interrupt the process manually, such as + // keyboard interrupt, ParallelExecutor may be destructed before Run() ends. + // Thus, disturbing exception messages would occur when interrupted. + // If is_alive_ is false, we would discard the last exception thrown by Run(). + // Since std::atomic_flag is always lock-free and faster than + // std::atomic, we choose std::atomic_flag to be the flag here. + std::atomic_flag is_alive_ = ATOMIC_FLAG_INIT; + + // A flag to indicate whether ParallelExecutor is running. 
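// The is_alive_/is_running_ flags above and just below rely on the classic
// std::atomic_flag test_and_set idiom: test_and_set() returns the previous
// value, so "already set" doubles as "someone else got here first". A hedged,
// self-contained sketch of the Run() guard (simplified; the real code also
// clears the flag on the exception path, as the try/catch above shows):
#include <atomic>
#include <stdexcept>

class RunGuard {
 public:
  void Run() {
    if (running_.test_and_set()) {
      throw std::runtime_error("previous Run() has not stopped");
    }
    // ... launch and wait for work ...
    running_.clear();  // allow the next Run()
  }

 private:
  std::atomic_flag running_ = ATOMIC_FLAG_INIT;
};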
+ std::atomic_flag is_running_ = ATOMIC_FLAG_INIT; #ifdef PADDLE_WITH_CUDA // ref_cnts_ is only initialized when ParallelExecutor constructs, and then diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c..a4abd1b128 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return VarInternal(name); } Variable* Scope::Var(std::string* name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindVarInternal(name); } +Variable* Scope::FindLocalVar(const std::string& name) const { + std::lock_guard lock(mutex_); + return FindVarLocally(name); +} + const Scope* Scope::FindScope(const Variable* var) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindScopeInternal(var); } void Scope::DropKids() { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +106,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e42fff1d79..14f9f36812 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -63,6 +63,11 @@ class Scope { /// Caller doesn't own the returned Variable. 
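// The scope.cc hunks above swap std::unique_lock for std::lock_guard. Both
// are RAII and unlock at end of scope, but lock_guard cannot be unlocked
// early, deferred, or moved, which makes it the cheaper and clearer choice
// when, as here, the lock is held for the whole critical section. Sketch:
#include <mutex>

class Registry {
 public:
  void Add(int v) {
    std::lock_guard<std::mutex> lock(mu_);  // locks now, unlocks at '}'
    last_ = v;
  }

 private:
  mutable std::mutex mu_;
  int last_{0};
};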
Variable* FindVar(const std::string& name) const; + /// Find a variable in the current scope. + /// Return nullptr if cannot find. + /// Caller doesn't own the returned Variable. + Variable* FindLocalVar(const std::string& name) const; + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. From d9b202e7172ce649945fd7042029cd6a742e1aa3 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 15 Oct 2018 15:25:09 +0800 Subject: [PATCH 34/48] Move tensor copy src_ptr and dst_ptr check to TensorCopy function test=develop --- paddle/fluid/framework/tensor_util.cc | 11 ++++ paddle/fluid/operators/reshape_op.cc | 77 +++++++++++---------------- 2 files changed, 43 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1d7a2eb5b3..de77d189c8 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -114,6 +114,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data from " << src.place() << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -132,6 +137,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, platform::is_gpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); + if (src_ptr == dst_ptr && + src_gpu_place.GetDeviceId() == dst_gpu_place.GetDeviceId()) { + VLOG(3) << "Skip copy the same data from " << src.place() << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } #endif diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index b8fdc3f826..500d86fec3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -195,7 +195,6 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } }; -template class ReshapeKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -228,15 +227,12 @@ class ReshapeKernel { "sequence_reshape op."); } - if (in->data() != - reinterpret_cast(out->mutable_data(ctx.GetPlace(), in->type()))) { - framework::TensorCopySync(*in, ctx.GetPlace(), out); - } + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); } }; -template class ReshapeGradKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -244,9 +240,8 @@ class ReshapeGradKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); auto in_dims = d_x->dims(); - if (d_out->data() != d_x->mutable_data(ctx.GetPlace(), d_out->type())) { - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - } + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); d_x->Resize(in_dims); } }; @@ -341,46 +336,38 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - 
ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #ifdef PADDLE_WITH_CUDA -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #endif From d3ed070e10abffd4e8315f42f3090be9a38c54b7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 15 Oct 2018 07:27:16 +0000 Subject: [PATCH 35/48] test=develop --- paddle/fluid/framework/parallel_executor.cc | 32 ++++----------------- paddle/fluid/framework/parallel_executor.h | 13 --------- 2 files changed, 5 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8d2e66009c..e8adabd265 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -64,8 +64,6 @@ ParallelExecutor::ParallelExecutor( const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, size_t num_trainers, size_t trainer_id) : member_(new 
ParallelExecutorPrivate(places)) { - is_alive_.test_and_set(); - member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_all_reduce_ = @@ -248,15 +246,6 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { - // If ParallelExecutor has been destructed - // just return - if (!is_alive_.test_and_set()) return; - - // If ParallelExecutor is running - if (is_running_.test_and_set()) { - PADDLE_THROW("The previous ParallelExecutor::Run() has not stopped"); - } - platform::RecordBlock b(0); #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { @@ -270,17 +259,9 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } } #endif - try { - auto fetch_data = member_->executor_->Run(fetch_tensors); - *member_->global_scope_->Var(fetched_var_name) - ->GetMutable() = fetch_data; - is_running_.clear(); - } catch (...) { - is_running_.clear(); - if (is_alive_.test_and_set()) { - std::rethrow_exception(std::current_exception()); - } - } + auto fetch_data = member_->executor_->Run(fetch_tensors); + *member_->global_scope_->Var(fetched_var_name)->GetMutable() = + fetch_data; } void ParallelExecutor::FeedTensorsIntoLocalScopes( @@ -318,7 +299,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } ParallelExecutor::~ParallelExecutor() { - is_alive_.clear(); if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { Scope *local_scope = member_->local_scopes_[i]; @@ -328,10 +308,8 @@ ParallelExecutor::~ParallelExecutor() { } } - while (is_running_.test_and_set()) { - // wait unitl all threads have been stopped - } - + // member_ must be destructed before gcs_ since the destructor of + // ReferenceCountOpHandle use raw pointers of gcs_ inside. member_.reset(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index b78f717375..ef09b98b2a 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -77,19 +77,6 @@ class ParallelExecutor { std::unique_ptr member_; - // FIXME(zjl): HOT-FIX - // A flag to indicate whether ParallelExecutor is destructed. - // In Python side, when users interrupt the process manually, such as - // keyboard interrupt, ParallelExecutor may be destructed before Run() ends. - // Thus, disturbing exception messages would occur when interrupted. - // If is_alive_ is false, we would discard the last exception thrown by Run(). - // Since std::atomic_flag is always lock-free and faster than - // std::atomic, we choose std::atomic_flag to be the flag here. - std::atomic_flag is_alive_ = ATOMIC_FLAG_INIT; - - // A flag to indicate whether ParallelExecutor is running. 
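// The explicit member_.reset() and the ordering comment above are the whole
// story: class members are destroyed in reverse declaration order, so a
// member whose destructor follows raw pointers into a later-declared member
// must be torn down by hand. A hedged sketch of the pitfall:
#include <memory>

struct Resource {
  int refs = 0;
};

struct User {
  Resource* raw = nullptr;
  ~User() {
    if (raw) raw->refs--;  // requires the Resource to still be alive
  }
};

struct Owner {
  std::unique_ptr<User> user;     // declared first, so destroyed last...
  std::unique_ptr<Resource> res;  // ...after res is already gone
  Owner() : user(new User), res(new Resource) { user->raw = res.get(); }
  ~Owner() { user.reset(); }  // fix: destroy the pointer-holder first
};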
- std::atomic_flag is_running_ = ATOMIC_FLAG_INIT; - #ifdef PADDLE_WITH_CUDA // ref_cnts_ is only initialized when ParallelExecutor constructs, and then // keeps unchanged From 2c9839c847fa3382e4e2165aa8b66413ad92ab98 Mon Sep 17 00:00:00 2001 From: chengduo Date: Mon, 15 Oct 2018 15:57:23 +0800 Subject: [PATCH 36/48] add cuda version display (#13885) test=develop --- paddle/fluid/platform/device_context.cc | 20 +++++++++++++++----- paddle/fluid/platform/device_context.h | 8 +++++--- paddle/fluid/platform/gpu_info.cc | 18 ++++++++++++++++++ paddle/fluid/platform/gpu_info.h | 6 ++++++ 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index dfc079e986..4286242b2a 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -198,9 +198,9 @@ class CudnnHolder { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place), cudnn_holder_(nullptr) { SetDeviceId(place_.device); - compute_capability = GetCUDAComputeCapability(place_.device); - multi_process = GetCUDAMultiProcessors(place_.device); - max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + compute_capability_ = GetCUDAComputeCapability(place_.device); + multi_process_ = GetCUDAMultiProcessors(place_.device); + max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); @@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } + driver_version_ = GetCUDADriverVersion(place_.device); + runtime_version_ = GetCUDARuntimeVersion(place_.device); + + LOG(INFO) << "device: " << place_.device + << ", CUDA Capability: " << compute_capability_ + << ", Driver Version: " << driver_version_ / 1000 << "." + << (driver_version_ % 100) / 10 + << ", Runtime Version: " << runtime_version_ / 1000 << "." 
+ << (runtime_version_ % 100) / 10; + callback_manager_.reset(new StreamCallbackManager(stream_)); } @@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const { } int CUDADeviceContext::GetComputeCapability() const { - return compute_capability; + return compute_capability_; } int CUDADeviceContext::GetMaxPhysicalThreadCount() const { - return multi_process * max_threads_per_mp; + return multi_process_ * max_threads_per_mp_; } Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7953919515..e1ff1a1746 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cublasHandle_t cublas_handle_; - int compute_capability; - int multi_process; - int max_threads_per_mp; + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; mutable std::mutex mtx_; diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index f599e7fbc8..8fff9844db 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) { return device_prop.major * 10 + device_prop.minor; } +int GetCUDARuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int runtime_version = 0; + PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version), + "cudaRuntimeGetVersion failed in " + "paddle::platform::cudaRuntimeGetVersion"); + return runtime_version; +} + +int GetCUDADriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int driver_version = 0; + PADDLE_ENFORCE(cudaDriverGetVersion(&driver_version), + "cudaDriverGetVersion failed in " + "paddle::platform::GetCUDADriverVersion"); + return driver_version; +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index f4640d3eaa..be44158431 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -29,6 +29,12 @@ int GetCUDADeviceCount(); //! Get the compute capability of the ith GPU (format: major * 10 + minor) int GetCUDAComputeCapability(int i); +//! Get the runtime version of the ith GPU +int GetCUDARuntimeVersion(int id); + +//! Get the driver version of the ith GPU +int GetCUDADriverVersion(int id); + //! Get the MultiProcessors of the ith GPU. 
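// The log message above decodes CUDA's packed version integers: both the
// driver and the runtime report 1000 * major + 10 * minor, so 9020 means 9.2.
// A hedged sketch of the same decode outside of Paddle:
#include <cuda_runtime.h>

#include <cstdio>

int main() {
  int runtime = 0, driver = 0;
  cudaRuntimeGetVersion(&runtime);  // e.g. 9020
  cudaDriverGetVersion(&driver);    // e.g. 10000
  std::printf("runtime %d.%d, driver %d.%d\n", runtime / 1000,
              (runtime % 100) / 10, driver / 1000, (driver % 100) / 10);
  return 0;
}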
int GetCUDAMultiProcessors(int i); From 24c9fbdba36b4b9804c63f7ddefeb1074714e63b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 15 Oct 2018 16:13:29 +0800 Subject: [PATCH 37/48] Polish code test=develop --- paddle/fluid/framework/tensor_util.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index de77d189c8..69bcbc0e58 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { @@ -115,7 +125,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data from " << src.place() << " to " + VLOG(3) << "Skip copy the same data from " << src_place << " to " << dst_place; return; } @@ -135,14 +145,13 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - if (src_ptr == dst_ptr && - src_gpu_place.GetDeviceId() == dst_gpu_place.GetDeviceId()) { - VLOG(3) << "Skip copy the same data from " << src.place() << " to " + if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " << dst_place; return; } + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } #endif From ddb76d0d091c965d07d79f9743683d6efd2ac7e7 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 15 Oct 2018 16:45:47 +0800 Subject: [PATCH 38/48] Make GetMutable more robust test=develop --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 +-- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/var_desc.h | 1 + paddle/fluid/framework/variable.h | 6 +++++- paddle/fluid/framework/variable_test.cc | 11 ++++++----- paddle/fluid/operators/parallel_do_op.cc | 21 ++++++++++++++++++++- python/paddle/fluid/layers/control_flow.py | 2 +- 8 files changed, 36 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4..a070b8efb8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, 
proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5eb..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable>()); + auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65..2840d503f1 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index e33849ef50..9d3fb81119 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -59,6 +59,7 @@ class VarDesc { public: explicit VarDesc(const std::string &name) { desc_.set_name(name); + // TODO(paddle-dev): Why default to lodtensor. 
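// (A likely answer to the TODO, not confirmed by this series: the vast
// majority of fluid variables are dense LoDTensors, so the constructor
// defaults to the common case, and anything else (SelectedRows, step scopes,
// reader queues) must call SetType() explicitly, as the InferVarType passes
// in these patches do.)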
desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); } diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b83..873e1b20a5 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4..003dcfd3df 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 97c36a83fc..ab25628d45 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { } }; +class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + framework::BlockDesc *sub_block = + boost::get(op_desc.GetAttr(kParallelBlock)); + for (auto &out_vars : op_desc.Outputs()) { + for (auto &out_var : out_vars.second) { + auto &var = block->FindRecursiveOrCreateVar(out_var); + auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); + if (sub_var.GetType() != var.GetType()) { + var.SetType(sub_var.GetType()); + } + } + } + } +}; + } // namespace operators } // namespace paddle @@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, paddle::operators::ParallelDoOpProtoMaker, paddle::operators::ParallelDoGradOpDescMaker); REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference); + paddle::operators::ParallelDoGradOpShapeInference, + paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 4af97e8632..e868ff8b6a 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1267,7 +1267,7 @@ class ConditionalBlock(object): ] step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) + name='control_scope', type=core.VarDesc.VarType.STEP_SCOPES) parent_block.append_op( type='conditional_block', inputs={ From aeec82acd5c37d110a71832d647f3c27834c7c8a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 15 Oct 2018 17:21:10 +0800 Subject: [PATCH 39/48] Add unittest for reshape op test=develop --- paddle/fluid/framework/tensor_util_test.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index a1e5b967a8..793ccfc79f 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { 
EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + TensorCopy(dst_tensor, *cpu_place, &dst_tensor); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); Tensor slice_tensor = src_tensor.Slice(1, 2); @@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + // Copy the same tensor + TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor From 50c5e9b0c63a547f46b677ad1e0090a953528bbc Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Oct 2018 14:27:32 +0200 Subject: [PATCH 40/48] reshape_2d used from ddim.h test=develop --- paddle/fluid/framework/ir/conv_bn_fuse_pass.cc | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 86926bec64..04459612a7 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -44,18 +44,6 @@ namespace ir { GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \ GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name) -// reshape to two dimensions {A, B * C * ...} -DDim make_dims_2d(DDim dims) { - auto dims_count = dims.size(); - PADDLE_ENFORCE_GT(dims_count, 0); - - int size2 = 1; - for (int i = 1; i < dims_count; i++) { - size2 *= dims[i]; - } - return make_ddim({dims[0], size2}); -} - void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, // const ir::Node& bn_scale, // @@ -104,7 +92,7 @@ void recompute_bias_and_weights(const Scope* scope, // Re-compute weight of conv2d from BN auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); auto weights_shape = weights->dims(); - auto weights_shape_2d = make_dims_2d(weights_shape); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); EigenMatrixArrayMap weights_array_2d( weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], From 60030e867892e657104a34436bf83bda93c4b8ca Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 15 Oct 2018 17:33:12 +0800 Subject: [PATCH 41/48] change the use of FLAGS_reader_queue_speed_test_mode test=develop --- paddle/fluid/CMakeLists.txt | 2 +- .../fluid/operators/reader/blocking_queue.h | 10 ++++---- .../reader/lod_tensor_blocking_queue.h | 10 ++++---- .../reader/reader_blocking_queue_test.cc | 23 ++++++++----------- paddle/fluid/pybind/pybind.cc | 3 ++- 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 519a00fb07..6e3411f7a2 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -14,4 +14,4 @@ if(WITH_INFERENCE) add_subdirectory(inference) endif() -add_subdirectory(train) +#add_subdirectory(train) diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 3eefb2db51..51b980acb5 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -14,14 +14,11 @@ #pragma once -#include #include // NOLINT #include #include "paddle/fluid/platform/enforce.h" -DECLARE_bool(reader_queue_speed_test_mode); - namespace paddle { namespace operators { namespace 
reader { @@ -34,8 +31,8 @@ class BlockingQueue { // is a workaround and a simplified version of framework::Channel as it // doesn't support GPU and it implements on buffered blocking queue. public: - explicit BlockingQueue(size_t capacity) - : capacity_(capacity), closed_(false) { + explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) + : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) { PADDLE_ENFORCE_GT( capacity_, 0, "The capacity of a reader::BlockingQueue must be greater than 0."); @@ -75,7 +72,7 @@ class BlockingQueue { if (!queue_.empty()) { PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); - if (LIKELY(!FLAGS_reader_queue_speed_test_mode)) { + if (LIKELY(!speed_test_mode_)) { queue_.pop_front(); } send_cv_.notify_one(); @@ -119,6 +116,7 @@ class BlockingQueue { private: size_t capacity_; + bool speed_test_mode_; bool closed_; std::deque queue_; diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 4f7cfc24ec..3f041ff7e4 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -33,8 +33,9 @@ class LoDTensorBlockingQueue { private: LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims) - : queue_(capacity), dims_(dims) {} + const std::vector& dims, + bool speed_test_mode = false) + : queue_(capacity, speed_test_mode), dims_(dims) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -69,11 +70,12 @@ class LoDTensorBlockingQueue { class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims) { + void InitOnce(size_t capacity, const std::vector& dims, + bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index cfcac11228..bd7ac64b2f 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -20,10 +20,6 @@ #include "paddle/fluid/operators/reader/blocking_queue.h" -DEFINE_bool(reader_queue_speed_test_mode, false, - "If set true, the queue.pop will only get data from queue but not " - "remove the data from queue for speed testing"); - using paddle::operators::reader::BlockingQueue; TEST(BlockingQueue, CapacityTest) { @@ -222,27 +218,26 @@ TEST(BlockingQueue, MyClassTest) { EXPECT_EQ(a.val_, b.val_); } -TEST(BlockingQueue, reader_queue_speed_test_mode_flag) { - FLAGS_reader_queue_speed_test_mode = false; +TEST(BlockingQueue, speed_test_mode) { size_t queue_size = 10; - BlockingQueue q(queue_size); + BlockingQueue q1(queue_size, false); for (size_t i = 0; i < queue_size; ++i) { - q.Send(i); + q1.Send(i); } size_t b; for (size_t i = 0; i < queue_size; ++i) { - q.Receive(&b); + q1.Receive(&b); EXPECT_EQ(b, i); } - EXPECT_EQ(q.Size(), 0); + EXPECT_EQ(q1.Size(), 0); - FLAGS_reader_queue_speed_test_mode = true; + BlockingQueue q2(queue_size, true); for (size_t i = 0; i < queue_size; ++i) { - q.Send(i); + q2.Send(i); } for (size_t i = 0; i < queue_size; ++i) { - q.Receive(&b); + q2.Receive(&b); EXPECT_EQ(b, 0); } - EXPECT_EQ(q.Size(), queue_size); + 
EXPECT_EQ(q2.Size(), queue_size); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2b730f2bdc..7af5b77051 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -341,7 +341,8 @@ All parameter, weight, gradient are variables in Paddle. return make_ddim(shape); }); auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims); + holder->InitOnce(capacity, dims, + FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); }, py::return_value_policy::copy); From ec25a09bd590f69cf6014e7282dda3a9c09deab3 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 15 Oct 2018 18:58:21 +0800 Subject: [PATCH 42/48] revert unused change test=develop --- paddle/fluid/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6e3411f7a2..519a00fb07 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -14,4 +14,4 @@ if(WITH_INFERENCE) add_subdirectory(inference) endif() -#add_subdirectory(train) +add_subdirectory(train) From b16e9cd105a97160143cfce00ab5cfbd3c547ddb Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 15 Oct 2018 19:03:40 +0800 Subject: [PATCH 43/48] a small fix for compile WITH_INFERENCE=OFF (#13869) test=develop --- paddle/fluid/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 519a00fb07..48b36df649 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -12,6 +12,5 @@ endif(NOT WIN32) if(WITH_INFERENCE) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) + add_subdirectory(train) endif() - -add_subdirectory(train) From b78579858504bd81e165e854700a17ad2fb6e733 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 15 Oct 2018 07:57:20 +0000 Subject: [PATCH 44/48] Expose layer's name for sequence pad & unpad test=develop --- paddle/fluid/API.spec | 4 ++-- python/paddle/fluid/layers/nn.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index f19e4e3827..2a4ad38214 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -75,8 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, 
keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1cd9a61ff9..7583299ff2 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2793,7 +2793,7 @@ def sequence_expand_as(x, y, name=None): @templatedoc() -def sequence_pad(x, pad_value, maxlen=None): +def sequence_pad(x, pad_value, maxlen=None, name=None): """ ${comment} @@ -2807,7 +2807,9 @@ def sequence_pad(x, pad_value, maxlen=None): None or any positive int. When it is None, all sequences will be padded up to the length of the longest one among them; when it a certain positive value, it must be greater than the length of the - longest original sequence." + longest original sequence. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The padded sequence batch and the original lengths before @@ -2844,7 +2846,7 @@ def sequence_pad(x, pad_value, maxlen=None): return out, length -def sequence_unpad(x, length): +def sequence_unpad(x, length, name=None): """ Sequence Unpad Layer @@ -2876,6 +2878,8 @@ def sequence_unpad(x, length): equal length. length(Variable): The Variable that specifies the actual ength of sequences after unpadding. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The Variable contains the unpadded sequences. From 288a112ffdbf0f2c858eefa650135959aeb25c01 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 16 Oct 2018 09:44:12 +0800 Subject: [PATCH 45/48] Revert "Revert "Revert "Make variable::GetMutable robust""" --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 ++- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/var_desc.h | 1 - paddle/fluid/framework/variable.h | 6 +----- paddle/fluid/framework/variable_test.cc | 11 +++++------ paddle/fluid/operators/parallel_do_op.cc | 21 +-------------------- python/paddle/fluid/layers/control_flow.py | 2 +- 8 files changed, 12 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index a070b8efb8..70ec6e90a4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf..8e1f93c5eb 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,7 +27,8 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. 
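// Note on the restored code below: the per-name feed list grows on demand.
// An index at or past the current size resizes the vector first, so feed
// slots may be written in any order.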
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = *(g_feed_value->GetMutable()); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 2840d503f1..ba10687d65 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 9d3fb81119..e33849ef50 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -59,7 +59,6 @@ class VarDesc { public: explicit VarDesc(const std::string &name) { desc_.set_name(name); - // TODO(paddle-dev): Why default to lodtensor. desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); } diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a5..067e0c2b83 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,12 +38,8 @@ class Variable { template T* GetMutable() { - if (!holder_) { + if (!IsType()) { holder_.reset(new PlaceholderImpl(new T())); - } else { - PADDLE_ENFORCE(IsType(), - "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3df..c5c1d215f4 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,10 +33,9 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - try { - v->GetMutable(); - } catch (std::exception& e) { - return; - } - EXPECT_TRUE(false); + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); } diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index ab25628d45..97c36a83fc 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -397,24 +397,6 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { } }; -class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - framework::BlockDesc *sub_block = - boost::get(op_desc.GetAttr(kParallelBlock)); - for (auto &out_vars : op_desc.Outputs()) { - for (auto &out_var : out_vars.second) { - auto &var = block->FindRecursiveOrCreateVar(out_var); - auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); - if (sub_var.GetType() != var.GetType()) { - var.SetType(sub_var.GetType()); - } - } - } - } -}; - } // namespace operators } // namespace paddle @@ -422,5 +404,4 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, paddle::operators::ParallelDoOpProtoMaker, paddle::operators::ParallelDoGradOpDescMaker); 
REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference, - paddle::operators::ParallelDoGradOpVarTypeInference); + paddle::operators::ParallelDoGradOpShapeInference); diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index e868ff8b6a..4af97e8632 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1267,7 +1267,7 @@ class ConditionalBlock(object): ] step_scope = parent_block.create_var( - name='control_scope', type=core.VarDesc.VarType.STEP_SCOPES) + type=core.VarDesc.VarType.STEP_SCOPES) parent_block.append_op( type='conditional_block', inputs={ From 699825a9d5dcb487a59a7abe64e87f23c9c01046 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Tue, 16 Oct 2018 04:53:48 +0000 Subject: [PATCH 46/48] Use length-based lod in seq_unpad's doc test=develop --- python/paddle/fluid/layers/nn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7583299ff2..ebe536f1ec 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2848,7 +2848,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): def sequence_unpad(x, length, name=None): """ - Sequence Unpad Layer + **Sequence Unpad Layer** This layer removes the padding data in the input sequences and convert them into sequences with actual length as output, identitied by lod @@ -2864,14 +2864,14 @@ def sequence_unpad(x, length, name=None): [11.0, 12.0, 13.0, 14.0, 15.0]], in which there are 3 sequences padded to length 5, and the acutal length - specified by input Variable *length*: + specified by input Variable **length**: length.data = [[2], [3], [4]], after unpadding, the output Variable will be: out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] - out.lod = [[0, 2, 5, 9]] + out.lod = [[2, 3, 4]] Args: x(Variable): Input Variable which contains the padded sequences with From 7a751b83ac733df2203712ccc536b4e07bf35092 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Tue, 16 Oct 2018 17:01:48 +0800 Subject: [PATCH 47/48] fix isfinite_op sprintf (#13850) test=develop --- paddle/fluid/operators/isfinite_op.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 248c779356..7b42efd623 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) 1-dim tensor, contains a bool scalar. The output " "tensor of overflow operator."); AddComment(string::Sprintf(R"DOC( -Overflow operator. +Overflow %s operator. $$Out = any(X)$$ @@ -69,6 +69,8 @@ Out = Inf if any X contains Inf, Out = Nan if any X contains Nan, Out = 0 if no Inf/Nan detected. If X contains both Inf/Nan, it will return the first indicator it meeted. 
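// The contract documented above reduces to a linear scan that reports the
// first Inf or NaN it encounters. A hedged sketch of an equivalent check for
// a plain float buffer (the real op works on framework tensors, not raw
// pointers):
#include <cmath>
#include <cstddef>

// Returns the offending value (Inf or NaN) if one exists, else 0.
float FirstOverflowIndicator(const float* x, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    if (std::isinf(x[i]) || std::isnan(x[i])) return x[i];
  }
  return 0.0f;
}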
From 7a751b83ac733df2203712ccc536b4e07bf35092 Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Tue, 16 Oct 2018 17:01:48 +0800
Subject: [PATCH 47/48] fix isfinite_op sprintf (#13850)

test=develop
---
 paddle/fluid/operators/isfinite_op.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
index 248c779356..7b42efd623 100644
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
@@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) 1-dim tensor, contains a bool scalar. The output "
              "tensor of overflow operator.");
     AddComment(string::Sprintf(R"DOC(
-Overflow operator.
+Overflow %s operator.
 
 $$Out = any(X)$$
 
@@ -69,6 +69,8 @@ Out = Inf if any X contains Inf,
 Out = Nan if any X contains Nan,
 Out = 0 if no Inf/Nan detected.
 If X contains both Inf/Nan, it will return the first indicator it meeted.
+
+%s
 )DOC",
                                GetName(), GetComments()));
   }

From fa2ab3346ce7700223a6bf42ff209715ca0464a0 Mon Sep 17 00:00:00 2001
From: tangwei12
Date: Tue, 16 Oct 2018 18:37:06 +0800
Subject: [PATCH 48/48] fill constant add infervarshape, lookuptable clone lr
 var (#13830)

* fill constant add infervarshape, lookuptable clone lr var

* test=develop

* add lookuptable ut, test=develop

* bug fix in transpiler about async with lookup table

* test=develop
---
 paddle/fluid/operators/fill_constant_op.cc    |  9 ++-
 .../fluid/tests/unittests/dist_simnet_bow.py  | 22 ++++--
 .../tests/unittests/test_dist_simnet_bow.py   | 78 ++++++++++++++++++-
 .../fluid/transpiler/distribute_transpiler.py | 24 +++---
 4 files changed, 113 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 2826b82117..e04a68717b 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase {
   }
 };
 
+class FillConstantOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -102,4 +108,5 @@ Fill up a variable with specified constant value.
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
                   ops::FillConstantInferShape, ops::FillConstantOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::FillConstantOpVarTypeInference);
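The new FillConstantOpVarTypeInference is an empty no-op. The patch does not state the motivation, but a plausible reading is that it pins the output's declared var type -- which the transpiler may have set to SELECTED_ROWS -- instead of letting anything rewrite it to the LOD_TENSOR default. For contrast, a hypothetical non-empty inference (illustration only, not part of this patch), built from the same APIs the removed ParallelDoGradOpVarTypeInference used, which would rewrite output types:

// Hypothetical: forcing every output back to LOD_TENSOR would clobber a
// SELECTED_ROWS output var; the registered no-op avoids such rewriting.
class ForcingVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc &op_desc,
                  framework::BlockDesc *block) const override {
    for (auto &out_vars : op_desc.Outputs()) {
      for (auto &out_var : out_vars.second) {
        block->FindRecursiveOrCreateVar(out_var).SetType(
            proto::VarType::LOD_TENSOR);
      }
    }
  }
};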
diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
index 6456d1b53a..fac5e037a4 100644
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
@@ -81,7 +81,10 @@ def get_optimizer():
     return optimizer
 
 
-def train_network(batch_size, is_distributed=False, is_sparse=False):
+def train_network(batch_size,
+                  is_distributed=False,
+                  is_sparse=False,
+                  is_self_contained_lr=False):
     # query
     q = fluid.layers.data(
         name="query_ids", shape=[1], dtype="int64", lod_level=1)
@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
         param_attr=fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.01),
             name="__emb__",
-            learning_rate=emb_lr),
+            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__"),
         is_sparse=is_sparse)
     ## vsum
     q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
         param_attr=fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.01),
             name="__emb__",
-            learning_rate=emb_lr),
+            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__"),
         is_sparse=is_sparse)
     ## vsum
     pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
         param_attr=fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.01),
             name="__emb__",
-            learning_rate=emb_lr),
+            learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__"),
         is_sparse=is_sparse)
     ## vsum
     nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase):
     def get_model(self, batch_size=2):
         # Train program
         avg_cost, acc, predict = \
-            train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
+            train_network(batch_size,
+                          bool(int(os.environ["IS_DISTRIBUTED"])),
+                          bool(int(os.environ["IS_SPARSE"])),
+                          bool(int(os.environ["IS_SELF_CONTAINED_LR"])))
 
         inference_program = fluid.default_main_program().clone()
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
index e971f29db4..11095f2359 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase):
         self._enforce_place = "CPU"
 
     def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=1e-5,
@@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
         self._enforce_place = "CPU"
 
     def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=100,
@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
         self._enforce_place = "CPU"
 
     def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=1e-5,
@@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
         self._enforce_place = "CPU"
 
     def test_simnet_bow(self):
-        need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
         self.check_with_place(
             "dist_simnet_bow.py",
             delta=100,
@@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
             need_envs=need_envs)
 
 
+class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def test_simnet_bow(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '1',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._enforce_place = "CPU"
+
+    def test_simnet_bow(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '1',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=100,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def test_simnet_bow(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '1',
+            "IS_SPARSE": '1',
+            'IS_SELF_CONTAINED_LR': '0'
+        }
+        self.check_with_place(
+            "dist_simnet_bow.py",
+            delta=1e-5,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 91db85b8ec..2192139f8d 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1119,6 +1119,7 @@ to transpile() call.")
 
     def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
         # 2. add split_ids_op and send_op to send gradient to pservers
+        # there should only be one table_name
         all_ops = program.global_block().ops
         table_grad_name = grad_var_name(self.table_name)
 
@@ -1143,7 +1144,7 @@ to transpile() call.")
                 if self.sync_mode else []
             },
             attrs={
-                "sync_mode": self.sync_mode,
+                "sync_mode": not self.sync_mode,
                 "epmap": pserver_endpoints,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                 OP_ROLE_VAR_ATTR_NAME: [
@@ -1189,7 +1190,15 @@ to transpile() call.")
     def _create_table_optimize_block(self, pserver_index, pserver_program,
                                      pre_block_idx, grad_to_block_id):
         # STEP: create table optimize block
+        table_opt_block = pserver_program._create_block(pre_block_idx)
         # create table param and grad var in pserver program
+        # create table optimize block in pserver program
+        table_opt_op = [
+            op for op in self.optimize_ops
+            if 'Param' in op.input_names and op.input("Param")[0] ==
+            self.table_name
+        ][0]
+
         origin_param_var = self.origin_program.global_block().vars[
             self.table_name]
 
@@ -1205,19 +1214,16 @@ to transpile() call.")
             dtype=origin_param_var.dtype,
             type=core.VarDesc.VarType.SELECTED_ROWS,
             persistable=True)
+        # parameter must be selected rows
         param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
         grad_var = pserver_program.global_block()._clone_variable(
             self.origin_program.global_block().vars[grad_var_name(
                 self.table_name)])
 
-        # create table optimize block in pserver program
-        table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
-        ][0]
-        table_opt_block = pserver_program._create_block(pre_block_idx)
+        lr_var = pserver_program.global_block()._clone_variable(
+            self.origin_program.global_block().vars[table_opt_op.input(
+                "LearningRate")[0]])
 
         if self.sync_mode:
             # create grad vars in pserver program
@@ -1249,8 +1255,6 @@ to transpile() call.")
             grad_var = pserver_program.global_block()._rename_var(
                 origin_grad_name, splited_grad_name)
 
-        lr_var = pserver_program.global_block().vars[table_opt_op.input(
-            "LearningRate")[0]]
         inputs = {
             "Param": [param_var],
             "Grad": [grad_var],