From 4ee1c9e60d278a5172c18549bfebbbe533fdfade Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 19 Mar 2018 19:07:57 -0700
Subject: [PATCH 01/67] "add sequence expand kernel"

---
 paddle/fluid/operators/sequence_expand_op.cu | 52 +++++++++++++++++++
 paddle/fluid/operators/sequence_expand_op.h  | 53 +++++++++++++-------
 2 files changed, 86 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 26622d23af..6477af89f1 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -15,6 +15,58 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sequence_expand_op.h"
 
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+__global__ sequence_expand_kernel(const T* x_data, T* out_data, size_t* lod,
+                                  size_t element_len) {
+  int BLOCK_SIZE = 1024;
+  __shared__ T shm_lod[BLOCK_SIZE];
+  for (int idx = threadIdx.x; idx < BLOCK_SIZE; ++idx) {
+    shm_lod[idx] = lod[idx];
+  }
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < lod.size();
+       idx += blockDim.x * gridDim.x) {
+    int scale = lod[i]
+  }
+}
+
+template <typename T>
+void SequenceExpandFunctor<platform::CPUDeviceContext, T>::operator()(
+    const platform::CPUDeviceContext& context, const LoDTensor& x,
+    LoDTensor* out) {
+  x_dims = x.dims();
+  size_t element_len = framework::product(x_dims) / x_dims[0];
+  T* out_data = out->mutable_data<T>(context.GetPlace());
+  auto out_starts = out->lod().back();
+
+  const int kThreadsPerBlock = 1024;
+  int block_cols = kThreadsPerBlock;
+  if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+    block_cols = ((out_cols + 31) >> 5) << 5;
+  }
+  int block_rows = kThreadsPerBlock / block_cols;
+  dim3 block_size = dim3(block_cols, block_rows, 1);
+
+  int max_threads = context.GetMaxPhysicalThreadCount();
+  int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+
+  int grid_cols =
+      std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
+  int grid_rows =
+      std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
+  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+  sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+      x.data<T>(), out->mutable_data<T>(context.GetPlace()),
+      out_starts.CUDAData(context.GetPlace()), element_len);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sequence_expand,

diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 76dde976db..12e4018b95 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -16,13 +16,44 @@ limitations under the License.
 */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 
+template <typename DeviceContext, typename T>
+struct SequenceExpandFunctor {
+  void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out);
+};
+
+// template <typename DeviceContext, typename T>
+// struct SequenceExpandGradFunctor {};
+
+template <typename T>
+void SequenceExpandFunctor<platform::CPUDeviceContext, T>::operator()(
+    const platform::CPUDeviceContext& context, const LoDTensor& x,
+    LoDTensor* out) {
+  x_dims = x.dims();
+  size_t element_len = framework::product(x_dims) / x_dims[0];
+  T* out_data = out->mutable_data<T>(context.GetPlace());
+  auto out_starts = out->lod().back();
+
+  for (size_t i = 0; i < out_starts.size() - 1; i++) {
+    int scale = out_starts[i + 1] - out_starts[i];
+    Eigen::TensorMap<
+        Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+        x_t(x_data, 1, element_len);
+    Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+        out_t(out_data, scale, element_len);
+    Eigen::array<int, 2> cast({{scale, 1}});
+    out_t.device(*context.eigen_device()) = x_t.broadcast(cast);
+    x_data += element_len;
+    out_data += element_len * scale;
+  }
+}
+
 template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
@@ -38,24 +69,8 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
                    "The size of last lod level in Input(Y)"
                    "must be equal to dims[0] of Input(X).");
     out->set_lod(y->lod());
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    size_t element_len = framework::product(x_dims) / x_dims[0];
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    auto out_starts = out->lod().back();
-
-    for (size_t i = 0; i < out_starts.size() - 1; i++) {
-      int scale = out_starts[i + 1] - out_starts[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          x_t(x_data, 1, element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          out_t(out_data, scale, element_len);
-      Eigen::array<int, 2> cast({{scale, 1}});
-      out_t.device(*place) = x_t.broadcast(cast);
-      x_data += element_len;
-      out_data += element_len * scale;
-    }
+    SequenceExpandFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *x, out);
   }
 };
 

From 26822bd774a99d19d5bb37f4890e82aacd57c391 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Tue, 20 Mar 2018 04:04:58 -0700
Subject: [PATCH 02/67] "add sequence kernel"

---
 paddle/fluid/operators/sequence_expand_op.cu | 107 +++++++++++++------
 paddle/fluid/operators/sequence_expand_op.h  |  86 ++++++++-------
 2 files changed, 123 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 6477af89f1..9cdb89f8fd 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -21,48 +21,89 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 
 template <typename T>
-__global__ sequence_expand_kernel(const T* x_data, T* out_data, size_t* lod,
-                                  size_t element_len) {
-  int BLOCK_SIZE = 1024;
-  __shared__ T shm_lod[BLOCK_SIZE];
-  for (int idx = threadIdx.x; idx < BLOCK_SIZE; ++idx) {
-    shm_lod[idx] = lod[idx];
+__global__ void sequence_expand_kernel(const T* x_data, T* out_data,
+                                       const size_t* lod, size_t lod_size,
+                                       size_t element_len) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < static_cast<int>(lod_size - 1);
+       tid_x += blockDim.x * gridDim.x) {
+    int scale = lod[tid_x + 1] - lod[tid_x];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+    for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) {
+      int tid_z = blockIdx.z * blockDim.z + threadIdx.z;
+      int item_start = tid_x / element_len;
+      for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) {
+        out_data[item_start * scale + tid_z] = x_data[item_start + tid_z];
+      }
+    }
   }
-  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < lod.size();
-       idx += blockDim.x * gridDim.x) {
-    int scale = lod[i]
+}
+
+template <typename T>
+__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data,
+                                            const size_t* lod, size_t lod_size,
+                                            size_t element_len,
+                                            size_t dout_size) {
+  extern __shared__ T shm[];
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < static_cast<int>(lod_size - 1);
+       tid_x += blockDim.x * gridDim.x) {
+    int scale = lod[tid_x + 1] - lod[tid_x];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+    for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) {
+      int tid_z = blockIdx.z * blockDim.z + threadIdx.z;
+      int item_start = tid_x / element_len;
+      for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) {
+        shm[item_start + tid_z] += doutx_data[item_start * scale + tid_z];
+      }
+    }
+  }
+  // synchronize before write to dx
+  __syncthreads();
+  for (int idx = blockDimx * blockIdx.x + threadIdx.x;
+       idx < static_cast<int>(dout_size); idx += blockDim.x * gridDim.x) {
+    dx_data[idx] = shm[idx;]
   }
 }
 
 template <typename T>
-void SequenceExpandFunctor<platform::CPUDeviceContext, T>::operator()(
-    const platform::CPUDeviceContext& context, const LoDTensor& x,
-    LoDTensor* out) {
-  x_dims = x.dims();
-  size_t element_len = framework::product(x_dims) / x_dims[0];
-  T* out_data = out->mutable_data<T>(context.GetPlace());
-  auto out_starts = out->lod().back();
+struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const LoDTensor& x, LoDTensor* out) {
+    auto x_dims = x.dims();
+    size_t element_len = framework::product(x_dims) / x_dims[0];
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_starts = out->lod().back();
 
-  const int kThreadsPerBlock = 1024;
-  int block_cols = kThreadsPerBlock;
-  if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-    block_cols = ((out_cols + 31) >> 5) << 5;
+    dim3 block_size(16, 32, element_len);
+    dim3 grid_size(10, 10);
+    sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+        x.data<T>(), out->mutable_data<T>(context.GetPlace()),
+        out_starts.CUDAData(context.GetPlace()), out_starts.size(),
+        element_len);
   }
-  int block_rows = kThreadsPerBlock / block_cols;
-  dim3 block_size = dim3(block_cols, block_rows, 1);
+};
 
-  int max_threads = context.GetMaxPhysicalThreadCount();
-  int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+template <typename T>
+struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const LoDTensor& x,
+                  const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) {
+    auto x_dims = x.dims();
+    size_t element_len = framework::product(x_dims) / x_dims[0];
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_starts = out->lod().back();
 
-  int grid_cols =
-      std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
-  int grid_rows =
-      std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
-  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
-  sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-      x.data<T>(), out->mutable_data<T>(context.GetPlace()),
-      out_starts.CUDAData(context.GetPlace()), element_len);
-}
+    dim3 block_size(16, 32, element_len);
+    dim3 grid_size(10, 10);
+    size_t out_size = framework::product(dx->dims());
+    sequence_expand_kernel<<<grid_size, block_size, out_size * sizeof(T),
+                             context.stream()>>>(
+        dout.data<T>(), dx->mutable_data<T>(context.GetPlace()),
+        out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len,
+        out_size);
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle

diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 12e4018b95..3b66bf3d8c 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -28,31 +28,36 @@ struct SequenceExpandFunctor {
   void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out);
 };
 
-// template <typename DeviceContext, typename T>
-// struct SequenceExpandGradFunctor {};
+template <typename DeviceContext, typename T>
+struct SequenceExpandGradFunctor {
+  void operator()(const DeviceContext& ctx, const LoDTensor& x,
+                  const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx);
+};
 
 template <typename T>
-void SequenceExpandFunctor<platform::CPUDeviceContext, T>::operator()(
-    const platform::CPUDeviceContext& context, const LoDTensor& x,
-    LoDTensor* out) {
-  x_dims = x.dims();
-  size_t element_len = framework::product(x_dims) / x_dims[0];
-  T* out_data = out->mutable_data<T>(context.GetPlace());
-  auto out_starts = out->lod().back();
+struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
+                  LoDTensor* out) {
+    auto x_dims = x.dims();
+    size_t element_len = framework::product(x_dims) / x_dims[0];
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_starts = out->lod().back();
 
-  for (size_t i = 0; i < out_starts.size() - 1; i++) {
-    int scale = out_starts[i + 1] - out_starts[i];
-    Eigen::TensorMap<
-        Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-        x_t(x_data, 1, element_len);
-    Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-        out_t(out_data, scale, element_len);
-    Eigen::array<int, 2> cast({{scale, 1}});
-    out_t.device(*context.eigen_device()) = x_t.broadcast(cast);
-    x_data += element_len;
-    out_data += element_len * scale;
+    for (size_t i = 0; i < out_starts.size() - 1; i++) {
+      int scale = out_starts[i + 1] - out_starts[i];
+      Eigen::TensorMap<
+          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          x_t(x_data, 1, element_len);
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          out_t(out_data, scale, element_len);
+      Eigen::array<int, 2> cast({{scale, 1}});
+      out_t.device(*context.eigen_device()) = x_t.broadcast(cast);
+      x_data += element_len;
+      out_data += element_len * scale;
+    }
   }
-}
+};
 
 template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
@@ -60,7 +65,6 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<LoDTensor>("X");
     auto* out = context.Output<LoDTensor>("Out");
-    const T* x_data = x->data<T>();
     auto x_dims = x->dims();
     auto* y = context.Input<LoDTensor>("Y");
     PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
@@ -86,19 +90,14 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
  * Grad(X).lod = Input(X).lod
  *
  * */
-template <typename DeviceContext, typename T>
-class SequenceExpandGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Input<LoDTensor>("Out");
-    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto out_last_level = out->lod().back();
-    d_x->set_lod(x->lod());
-    const T* d_out_data = d_out->data<T>();
+template <typename T>
+struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const LoDTensor& x,
+                  const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) {
+    auto out_last_level = out.lod().back();
+    const T* d_out_data = d_out.data<T>();
     T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-    size_t element_len = d_out->numel() / d_out->dims()[0];
+    size_t element_len = d_out.numel() / d_out.dims()[0];
     for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
       size_t repeat = out_last_level[i + 1] - out_last_level[i];
       Eigen::TensorMap<
          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
          d_out_t(d_out_data, static_cast<int>(repeat), element_len);
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
          d_x_t(d_x_data, static_cast<int>(element_len));
-      auto place =
-          context.template device_context<DeviceContext>().eigen_device();
-      d_x_t.device(*place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
+      d_x_t.device(*context.eigen_device()) =
+          d_out_t.sum(Eigen::array<int, 1>({{0}}));
       d_out_data += (repeat * element_len);
       d_x_data += element_len;
     }
   }
 };
 
+template <typename DeviceContext, typename T>
+class SequenceExpandGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = context.Input<LoDTensor>("X");
+    auto* out = context.Input<LoDTensor>("Out");
+    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    d_x->set_lod(x->lod());
+    SequenceExpandGradFunctor(context.template device_context(), *x, *out,
+                              d_out, d_x);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

From e4c35d837d79c4b1a4f30e42efe143f64ec10e71 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Tue, 20 Mar 2018 04:43:00 -0700
Subject: [PATCH 03/67] "add details"

---
 paddle/fluid/operators/sequence_expand_op.cu | 19 +++++++++----------
 paddle/fluid/operators/sequence_expand_op.h  | 18 ++++++++++--------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 9cdb89f8fd..cae0a69284 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -54,15 +54,15 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data,
       int tid_z = blockIdx.z * blockDim.z + threadIdx.z;
       int item_start = tid_x / element_len;
       for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) {
-        shm[item_start + tid_z] += doutx_data[item_start * scale + tid_z];
+        shm[item_start + tid_z] += dout_data[item_start * scale + tid_z];
       }
     }
   }
   // synchronize before write to dx
   __syncthreads();
-  for (int idx = blockDimx * blockIdx.x + threadIdx.x;
+  for (int idx = blockDim.x * blockIdx.x + threadIdx.x;
        idx < static_cast<int>(dout_size); idx += blockDim.x * gridDim.x) {
-    dx_data[idx] = shm[idx;]
+    dx_data[idx] = shm[idx];
   }
 }
 
@@ -86,19 +86,18 @@ struct SequenceExpandFunctor {
 template <typename T>
 struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx, const LoDTensor& x,
-                  const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const LoDTensor& x, const LoDTensor& out,
+                  const LoDTensor& dout, LoDTensor* dx) {
     auto x_dims = x.dims();
     size_t element_len = framework::product(x_dims) / x_dims[0];
-    const T* x_data = x->data<T>();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    auto out_starts = out->lod().back();
+    auto out_starts = out.lod().back();
 
     dim3 block_size(16, 32, element_len);
     dim3 grid_size(10, 10);
     size_t out_size = framework::product(dx->dims());
-    sequence_expand_kernel<<<grid_size, block_size, out_size * sizeof(T),
-                             context.stream()>>>(
+    sequence_expand_grad_kernel<<<grid_size, block_size, out_size * sizeof(T),
+                                  context.stream()>>>(
         dout.data<T>(), dx->mutable_data<T>(context.GetPlace()),
         out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len,
         out_size);
   }
 };

diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 3b66bf3d8c..11890b30ae 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -40,7 +40,7 @@ struct SequenceExpandFunctor {
                   LoDTensor* out) {
     auto x_dims = x.dims();
     size_t element_len = framework::product(x_dims) / x_dims[0];
-    const T* x_data = x->data<T>();
+    const T* x_data = x.data<T>();
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto out_starts = out->lod().back();
 
@@ -92,12 +92,12 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
  * */
 template <typename T>
 struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx, const LoDTensor& x,
+  void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
                   const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) {
     auto out_last_level = out.lod().back();
-    const T* d_out_data = d_out.data<T>();
-    T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-    size_t element_len = d_out.numel() / d_out.dims()[0];
+    const T* d_out_data = dout.data<T>();
+    T* d_x_data = dx->mutable_data<T>(context.GetPlace());
+    size_t element_len = dout.numel() / dout.dims()[0];
     for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
       size_t repeat = out_last_level[i + 1] - out_last_level[i];
       Eigen::TensorMap<
@@ -117,13 +117,15 @@ template <typename DeviceContext, typename T>
 class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x = context.Input<LoDTensor>("X");
     auto* out = context.Input<LoDTensor>("Out");
+    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+
     auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
     d_x->set_lod(x->lod());
-    SequenceExpandGradFunctor(context.template device_context(), *x, *out,
-                              d_out, d_x);
+    SequenceExpandGradFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *x, *out, *d_out,
+            d_x);
   }
 };

From 53c8c36a04f92685f3fc380cbc41b9af1031de67 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Wed, 21 Mar 2018 05:49:53 -0700
Subject: [PATCH 04/67] "debug the process"

---
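(Reference for review, not part of the commit: the expansion these kernels
implement can be checked against a small host-side sketch. The names here
are illustrative, not Paddle API; `repeats` holds per-row repeat counts and
`out_offset` the first output row of each input row, mirroring the `lod`
and `out_offset` kernel parameters introduced below.)

    #include <cstddef>
    #include <vector>

    // Row i of x (item_len elements each) is written repeats[i] times
    // into out, starting at output row out_offset[i].
    template <typename T>
    void ExpandRowsRef(const T* x, const std::vector<size_t>& repeats,
                       const std::vector<size_t>& out_offset, size_t item_len,
                       T* out) {
      for (size_t i = 0; i < repeats.size(); ++i) {
        for (size_t r = 0; r < repeats[i]; ++r) {
          for (size_t k = 0; k < item_len; ++k) {
            out[(out_offset[i] + r) * item_len + k] = x[i * item_len + k];
          }
        }
      }
    }

The backward pass is the transpose of this copy: every output row produced
from input row i must be summed back into dx row i, which is why the
gradient kernel below accumulates via shared memory and atomics.
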
 paddle/fluid/framework/executor.cc             |   2 +-
 paddle/fluid/operators/sequence_expand_op.cu   | 128 ++++++++++++------
 .../paddle/fluid/tests/unittests/op_test.py   |   3 +
 .../tests/unittests/test_sequence_expand.py   |  88 ++++++------
 4 files changed, 133 insertions(+), 88 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 7155d5ef2f..5125072ddd 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -44,7 +44,7 @@ struct ExecutorPrepareContext {
   ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
       : prog_(prog), block_id_(block_id) {}
 
-  const framework::ProgramDesc& prog_;
+  const framework::ProgramDesc prog_;
   size_t block_id_;
   std::vector<std::unique_ptr<OperatorBase>> ops_;
 };

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index cae0a69284..bf453ca7e8 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
+#include
+#include
 #include "paddle/fluid/operators/sequence_expand_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -22,47 +25,71 @@ using LoDTensor = framework::LoDTensor;
 
 template <typename T>
 __global__ void sequence_expand_kernel(const T* x_data, T* out_data,
-                                       const size_t* lod, size_t lod_size,
-                                       size_t element_len) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < static_cast<int>(lod_size - 1);
-       tid_x += blockDim.x * gridDim.x) {
-    int scale = lod[tid_x + 1] - lod[tid_x];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) {
-      int tid_z = blockIdx.z * blockDim.z + threadIdx.z;
-      int item_start = tid_x / element_len;
-      for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) {
-        out_data[item_start * scale + tid_z] = x_data[item_start + tid_z];
-      }
+                                       const size_t* lod,
+                                       const size_t* out_offset,
+                                       size_t lod_size, size_t element_len,
+                                       size_t x_size) {
+  int bid_x = blockIdx.x;
+  if (bid_x > lod_size) return;
+  int repeats = lod[bid_x];
+  int offset = out_offset[bid_x];
+  for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
+    for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
+      out_data[(offset + tid_y) * element_len + tid_x] =
+          x_data[bid_x * element_len + tid_x];
     }
   }
 }
 
 template <typename T>
 __global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data,
-                                            const size_t* lod, size_t lod_size,
-                                            size_t element_len,
-                                            size_t dout_size) {
+                                            const size_t* lod,
+                                            const size_t* out_offset,
+                                            size_t lod_size, size_t element_len,
+                                            size_t dout_size, size_t dx_size) {
+  // reduce visit memory time.
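+  // (All of dout is staged into shared memory, dx is accumulated there,
+  // and only the final result is written back to global memory.)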
+  // dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1]
+  if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 &&
+      threadIdx.y == 0) {
+    printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n",
+           lod_size, element_len, dout_size, dx_size);
+  }
   extern __shared__ T shm[];
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < static_cast<int>(lod_size - 1);
-       tid_x += blockDim.x * gridDim.x) {
-    int scale = lod[tid_x + 1] - lod[tid_x];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) {
-      int tid_z = blockIdx.z * blockDim.z + threadIdx.z;
-      int item_start = tid_x / element_len;
-      for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) {
-        shm[item_start + tid_z] += dout_data[item_start * scale + tid_z];
-      }
+  T* dout_shm = shm;
+  T* dx_shm = &shm[dout_size];
+
+  // int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int idx = 0; idx < dout_size; ++idx) {
+    if (idx < dx_size) {
+      dx_shm[idx] = 0.0;
+    }
+    if (idx < dout_size) {
+      dout_shm[idx] = dout_data[idx];
+    }
+  }
+
+  int bid_x = blockIdx.x;
+  if (bid_x > lod_size) return;
+  int repeats = lod[bid_x];
+  int offset = out_offset[bid_x];
+  if (threadIdx.x == 0) {
+    printf("repeats=%d, offset=%ld\n", repeats, offset);
+  }
+  for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
+    for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
+      T val = dout_shm[(offset + tid_y) * element_len + tid_x];
+      platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val);
+      int dx_idx = bid_x * element_len + tid_x;
+      int dout_idx = (offset + tid_y) * element_len + tid_x;
+      printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n",
+             dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val);
     }
   }
-  // synchronize before write to dx
   __syncthreads();
-  for (int idx = blockDim.x * blockIdx.x + threadIdx.x;
-       idx < static_cast<int>(dout_size); idx += blockDim.x * gridDim.x) {
-    dx_data[idx] = shm[idx];
+  // copy shared memory back to dx
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size;
+       idx += blockDim.x * gridDim.x) {
+    dx_data[idx] = dx_shm[idx];
   }
 }
 
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const LoDTensor& x, LoDTensor* out) {
     auto x_dims = x.dims();
     size_t element_len = framework::product(x_dims) / x_dims[0];
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    auto out_starts = out->lod().back();
+    auto lod = out->lod().back();
+    framework::Vector<size_t> out_lod;
+    for (size_t i = 0; i < lod.size() - 1; ++i) {
+      out_lod.push_back(lod[i + 1] - lod[i]);
+    }
 
-    dim3 block_size(16, 32, element_len);
-    dim3 grid_size(10, 10);
+    int thread_x = std::max(static_cast<int>(element_len), 32);
+    int block_x = static_cast<int>(out_lod.size());
+    dim3 block_size(thread_x, 1024 / thread_x);
+    dim3 grid_size(block_x, 1);
     sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         x.data<T>(), out->mutable_data<T>(context.GetPlace()),
-        out_starts.CUDAData(context.GetPlace()), out_starts.size(),
-        element_len);
+        out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
+        out_lod.size(), element_len, framework::product(x_dims));
   }
 };
 
 template <typename T>
 struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const LoDTensor& x, const LoDTensor& out,
                   const LoDTensor& dout, LoDTensor* dx) {
     auto x_dims = x.dims();
     size_t element_len = framework::product(x_dims) / x_dims[0];
-    auto out_starts = out.lod().back();
+    auto lod = out.lod().back();
+    framework::Vector<size_t> out_lod;
+    for (size_t i = 0; i < lod.size() - 1; ++i) {
+      out_lod.push_back(lod[i + 1] - lod[i]);
+    }
+    size_t dout_size = framework::product(dout.dims());
+    size_t dx_size = framework::product(dx->dims());
 
-    dim3 block_size(16, 32, element_len);
-    dim3 grid_size(10, 10);
-    size_t out_size = framework::product(dx->dims());
-    sequence_expand_grad_kernel<<<grid_size, block_size, out_size * sizeof(T),
-                                  context.stream()>>>(
-        dout.data<T>(), dx->mutable_data<T>(context.GetPlace()),
-        out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len,
-        out_size);
+    int thread_x = std::max(static_cast<int>(element_len), 32);
+    dim3 block_size(thread_x, 1024 / thread_x);
+    int block_x = static_cast<int>(out_lod.size());
+    dim3 grid_size(block_x, 1);
+    sequence_expand_grad_kernel<<<grid_size, block_size,
+                                  (dout_size + dx_size) * sizeof(T),
+                                  context.stream()>>>(
+        dout.data<T>(), dx->mutable_data<T>(context.GetPlace()),
+        out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
+        out_lod.size(), element_len, dout_size, dx_size);
   }
 };

diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 8393f7827b..555f188abb 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -362,6 +362,9 @@ class OpTest(unittest.TestCase):
         for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
             abs_a[abs_a < 1e-3] = 1
+            print("actual", a)
+            print("*****")
+            print("expected", b)
 
             diff_mat = np.abs(a - b) / abs_a
             max_diff = np.max(diff_mat)

diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 957fa5d2c4..f984127b4d 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -19,8 +19,14 @@ from op_test import OpTest
 
 class TestSequenceExpand(OpTest):
     def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
-        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
+        x = [i / 10.0 for i in range(3)]
+        y = [i / 10.0 for i in range(8)]
+        x_data = np.array(x).reshape(3, 1).astype('float32')
+        y_data = np.array(y).reshape(8, 1).astype('float32')
+        print(x_data)
+        print(y_data)
+        # x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
+        # y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
         y_lod = [[0, 1, 4, 8]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
 
@@ -45,47 +51,43 @@ class TestSequenceExpand(OpTest):
     def test_check_grad(self):
         self.check_grad(["X"], "Out")
 
-
-class TestSequenceExpandCase1(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[0, 2, 5]]
-        y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
-        y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-class TestSequenceExpandCase2(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
-        x_lod = [[0, 1]]
-        y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
-        y_lod = [[0, 2]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-class TestSequenceExpandCase3(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
-        x_lod = [[0, 1, 2, 3, 4]]
-        y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
-        y_lod = [[0, 2, 4, 4, 6]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
-
-
-class TestSequenceExpandCase4(TestSequenceExpand):
-    def set_data(self):
-        x_data = np.array(
-            [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape(
-                [2, 5]).astype('float32')
-        x_lod = [[
-            0,
-            1,
-            2,
-        ]]
-        y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
-        y_lod = [[0, 1, 2], [0, 1, 2]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+    # class TestSequenceExpandCase1(TestSequenceExpand):
+    #     def set_data(self):
+    #         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
+    #         x_lod = [[0, 2, 5]]
+    #         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
+    #         y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
+    #         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+    # class TestSequenceExpandCase2(TestSequenceExpand):
+    #     def set_data(self):
+    #         x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
+    #         x_lod = [[0, 1]]
+    #         y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
+    #         y_lod = [[0, 2]]
+    #         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+    # class TestSequenceExpandCase3(TestSequenceExpand):
+    #     def set_data(self):
+    #         x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
+    #         x_lod = [[0, 1, 2, 3, 4]]
+    #         y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
+    #         y_lod = [[0, 2, 4, 4, 6]]
+    #         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+    # class TestSequenceExpandCase4(TestSequenceExpand):
+    #     def set_data(self):
+    #         x_data = np.array(
+    #             [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape(
+    #                 [2, 5]).astype('float32')
+    #         x_lod = [[
+    #             0,
+    #             1,
+    #             2,
+    #         ]]
+    #         y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
+    #         y_lod = [[0, 1, 2], [0, 1, 2]]
+    #         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
 if __name__ == '__main__':

From db1b128feb63a14514c2e38e344f6b464e1b7a68 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 26 Mar 2018 20:16:57 -0700
Subject: [PATCH 05/67] "add details"

---
 paddle/fluid/operators/sequence_expand_op.h | 161 ++++++++++++------
 1 file changed, 114 insertions(+), 47 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 11890b30ae..5cab367988 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -13,15 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
 #pragma once
+#include <numeric>  // std::itoa
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 struct SequenceExpandFunctor {
@@ -38,23 +42,35 @@ template <typename T>
 struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
                   LoDTensor* out) {
-    auto x_dims = x.dims();
-    size_t element_len = framework::product(x_dims) / x_dims[0];
-    const T* x_data = x.data<T>();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    auto out_starts = out->lod().back();
-
-    for (size_t i = 0; i < out_starts.size() - 1; i++) {
-      int scale = out_starts[i + 1] - out_starts[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          x_t(x_data, 1, element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          out_t(out_data, scale, element_len);
-      Eigen::array<int, 2> cast({{scale, 1}});
-      out_t.device(*context.eigen_device()) = x_t.broadcast(cast);
-      x_data += element_len;
-      out_data += element_len * scale;
+    auto& out_lod = out->lod()[0];
+    framework::Vector<size_t> x_lod;
+    if (x.lod() == 1) {
+      x_lod = x.lod()[0];
+    } else {
+      x_lod.reserve(out_lod.size());
+      std::itoa(x_lod.begin(), x_lod.end(), 0);  // fill 0 ~ out_lod.size()-1
+    }
+    int out_offset = 0;
+    auto& eigen_place = *context.eigen_device();
+    for (size_t i = 1; i < out_lod.size(); ++i) {
+      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+      int x_start = x_lod[i - 1];
+      int x_end = x_lod[i];
+      int x_seq_len = x_end - x_start;
+      if (repeat_num > 0) {
+        auto x_sub_tensor = x->Slice(x_start, x_end);
+        x_sub_tensor.Resize({1, x_sub_tensor.numel()});
+        int out_start = out_offset;
+        if (x_lod.size() == 1) {
+          out_start = out_lod[0][out_offset];
+        }
+        auto out_sub_tensor =
+            out->Slice(out_start, out_start + x_seq_len * repeat_num);
+        out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
+        EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
+            EigenMatrix<T>::From(x_sub_tensor)
+                .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
+      }
     }
   }
 };
 
 template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    auto x_dims = x->dims();
     auto* y = context.Input<LoDTensor>("Y");
-    PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
-    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
-                      y->lod().back().size() - 1,
-                      "The size of last lod level in Input(Y)"
-                      "must be equal to dims[0] of Input(X).");
-    out->set_lod(y->lod());
+    auto* out = context.Output<LoDTensor>("Out");
+
+    int ref_level = context.Attr<int>("ref_level");
+    auto& x_lod = x->lod();
+    auto& y_lod = y->lod();
+
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+    out->mutable_data<T>(context.GetPlace());
+
+    if (y_lod[ref_level].size() <= 1) {
+      framework::TensorCopy(*x, context.GetPlace(), out);
+      return;
+    }
+
+    auto& out_lod = *out->mutable_lod();
+    // x lod level is at most 1.
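+    // (If X carries no LoD, Out's LoD is simply Y's reference level; if X
+    // has one level, each X sequence length is repeated repeat_num times.)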
+    if (x_lod.size() == 0) {
+      out_lod = y_lod[ref_level];
+    } else if (x_lod.size() == 1) {
+      out_lod.resize(1);
+      out_lod[0] = {0};
+      int out_offset = 0;
+      for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+        int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+        int x_start = x_lod[0][i - 1];
+        int x_end = x_lod[0][i];
+        int x_seq_len = x_end - x_start;
+        for (int j = 0; j < repeat_num; ++j) {
+          out_lod[0].push_back(out_lod[0].back() + x_seq_len);
+          out_offset++;
+        }
+      }
+    }
+
     SequenceExpandFunctor<DeviceContext, T> functor;
     functor(context.template device_context<DeviceContext>(), *x, out);
   }
 };
 
@@ -94,21 +137,31 @@ template <typename T>
 struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
                   const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) {
-    auto out_last_level = out.lod().back();
-    const T* d_out_data = dout.data<T>();
-    T* d_x_data = dx->mutable_data<T>(context.GetPlace());
-    size_t element_len = dout.numel() / dout.dims()[0];
-    for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
-      size_t repeat = out_last_level[i + 1] - out_last_level[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          d_out_t(d_out_data, static_cast<int>(repeat), element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
-          d_x_t(d_x_data, static_cast<int>(element_len));
-      d_x_t.device(*context.eigen_device()) =
-          d_out_t.sum(Eigen::array<int, 1>({{0}}));
-      d_out_data += (repeat * element_len);
-      d_x_data += element_len;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(dev_ctx, g_x, static_cast<T>(0));
+
+    int g_out_offset = 0;
+    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+      if (repeat_num > 0) {
+        int x_start = i - 1;
+        int x_end = i;
+        if (x_lod.size() == 1) {
+          x_start = x_lod[0][i - 1];
+          x_end = x_lod[0][i];
+        }
+        int x_seq_len = x_end - x_start;
+        auto g_x_sub = g_x->Slice(x_start, x_end);
+        g_x_sub.Resize(flatten_to_1d(g_x_sub.dims()));
+        int g_out_end = g_out_offset + repeat_num * x_seq_len;
+        auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
+        g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]});
+        math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
+        col_sum(dev_ctx, g_out_sub, &g_x_sub);
+        g_out_offset += repeat_num * x_seq_len;
+      }
     }
   }
 };
 
 template <typename DeviceContext, typename T>
 class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
+    auto* g_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Input<LoDTensor>("Out");
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* y = context.Input<LoDTensor>("Y");
+    auto* g_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    int ref_level = context.Attr<int>("ref_level");
+
+    g_x->mutable_data<T>(context.GetPlace());
+    g_x->set_lod(x->lod());
+
+    auto& x_lod = x->lod();
+    auto& y_lod = y->lod();
+
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+    // just copy the gradient
+    if (y_lod[ref_level].size() <= 1) {
+      framework::TensorCopy(*g_out, context.GetPlace(), g_x);
+      return;
+    }
 
-    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    d_x->set_lod(x->lod());
     SequenceExpandGradFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(), *x, *out, *d_out,
-            d_x);
+    functor(context.template device_context<DeviceContext>(), *x, *y, *g_out,
+            g_x);
   }
 };

From 0be1e09f2c703c1479259ab68b06cc4bd1cb5c43 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Wed, 28 Mar 2018 02:34:34 -0700
Subject: [PATCH 06/67] "fix ci"

---
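(Review note, not part of the commit: the shared-memory `mem[]` loop that
both rewritten kernels below compute is just a running sum of per-sequence
output sizes. A host-side sketch, with offset-style LoD vectors as assumed
throughout this series, e.g. x_lod = {0, 2, 5}; the same table is what the
later "fix based on comment" patch precomputes and passes in as `offset`.)

    #include <cstddef>
    #include <vector>

    // offset[i] = first output row of sequence i; sequence i occupies
    // (ref_lod[i+1] - ref_lod[i]) * (x_lod[i+1] - x_lod[i]) output rows.
    std::vector<size_t> ExpandOffsets(const std::vector<size_t>& x_lod,
                                      const std::vector<size_t>& ref_lod) {
      std::vector<size_t> offset(x_lod.size() - 1, 0);
      size_t acc = 0;
      for (size_t i = 0; i + 1 < x_lod.size(); ++i) {
        offset[i] = acc;
        acc += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
      }
      return offset;
    }
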
 paddle/fluid/operators/sequence_expand_op.cu  | 193 +++++++++---------
 paddle/fluid/operators/sequence_expand_op.h   | 130 ++++++------
 .../tests/unittests/test_sequence_expand.py   |  22 +-
 4 files changed, 183 insertions(+), 167 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index 786fe63e75..ae52849162 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -84,12 +84,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
         }
       }
       out_dims[0] = out_first_dim;
-      ctx->SetOutputDim("Out", out_dims);
     } else {
       out_dims[0] = -1;
-      ctx->SetOutputDim("Out", out_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
     }
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 743e3bbc29..1bd7342652 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -24,123 +24,128 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 
 template <typename T>
-__global__ void sequence_expand_kernel(const T* x_data, T* out_data,
-                                       const size_t* lod,
-                                       const size_t* out_offset,
-                                       size_t lod_size, size_t element_len,
-                                       size_t x_size) {
-  int bid_x = blockIdx.x;
-  if (bid_x > lod_size) return;
-  int repeats = lod[bid_x];
-  int offset = out_offset[bid_x];
-  for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
-    for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
-      out_data[(offset + tid_y) * element_len + tid_x] =
-          x_data[bid_x * element_len + tid_x];
+__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
+                                       const size_t* ref_lod,
+                                       const size_t lod_size,
+                                       /* default=1,
+                                          the instance length*/
+                                       const int x_item_length, T* out_data) {
+  constexpr int N = 1024;
+  __shared__ int mem[N];
+  int offset = 0;
+  for (int i = 0; i < lod_size; ++i) {
+    mem[i] = offset;
+    if (i < lod_size - 1) {
+      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
     }
   }
-}
+  __syncthreads();
 
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+
+  int x_item_count = x_lod[bid + 1] - x_lod[bid];
+  int repeats = ref_lod[bid + 1] - ref_lod[bid];
+  int out_offset = mem[bid];
+  int x_offset = x_lod[bid];
+  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
+    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
+      for (int tid_x = threadIdx.x; tid_x < x_item_length;
+           tid_x += blockDim.x) {
+        out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
+                 tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
+      }
+    }
+  }
+}
+
 template <typename T>
-__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data,
-                                            const size_t* lod,
-                                            const size_t* out_offset,
-                                            size_t lod_size, size_t element_len,
-                                            size_t dout_size, size_t dx_size) {
-  // reduce visit memory time.
-  // dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1]
-  if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 &&
-      threadIdx.y == 0) {
-    printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n",
-           lod_size, element_len, dout_size, dx_size);
-  }
-  extern __shared__ T shm[];
-  T* dout_shm = shm;
-  T* dx_shm = &shm[dout_size];
-
-  // int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int idx = 0; idx < dout_size; ++idx) {
-    if (idx < dx_size) {
-      dx_shm[idx] = 0.0;
-    }
-    if (idx < dout_size) {
-      dout_shm[idx] = dout_data[idx];
-    }
+__global__ void sequence_expand_grad_kernel(const T* dout_data,
+                                            const size_t* ref_lod,
+                                            const size_t* dx_lod,
+                                            const size_t lod_size,
+                                            /* default=1,
+                                               the instance length*/
+                                            const int x_item_length,
+                                            T* dx_data) {
+  // TODO(dzhwinter) : too many atomicAdd
+  // use shared memory to reduce memory visits
+  constexpr int N = 1024;
+  __shared__ int mem[N];
+  int offset = 0;
+  for (int i = 0; i < lod_size; ++i) {
+    mem[i] = offset;
+    if (i < lod_size - 1) {
+      offset += (ref_lod[i + 1] - ref_lod[i]) * (dx_lod[i + 1] - dx_lod[i]);
+    }
   }
+  __syncthreads();
 
-  int bid_x = blockIdx.x;
-  if (bid_x > lod_size) return;
-  int repeats = lod[bid_x];
-  int offset = out_offset[bid_x];
-  if (threadIdx.x == 0) {
-    printf("repeats=%d, offset=%ld\n", repeats, offset);
-  }
-  for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) {
-    for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) {
-      T val = dout_shm[(offset + tid_y) * element_len + tid_x];
-      platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val);
-      int dx_idx = bid_x * element_len + tid_x;
-      int dout_idx = (offset + tid_y) * element_len + tid_x;
-      printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n",
-             dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val);
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+  int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
+  int repeats = ref_lod[bid + 1] - ref_lod[bid];
+  int out_offset = mem[bid];
+  int x_offset = dx_lod[bid];
+
+  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
+    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
+      for (int tid_x = threadIdx.x; tid_x < x_item_length;
+           tid_x += blockDim.x) {
+        platform::CudaAtomicAdd(
+            &dx_data[(x_offset + tid_y) * x_item_length + tid_x],
+            dout_data[(out_offset + tid_z * x_item_count + tid_y) *
+                          x_item_length +
+                      tid_x]);
+      }
     }
   }
-  __syncthreads();
-  // copy shared memory back to dx
-  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size;
-       idx += blockDim.x * gridDim.x) {
-    dx_data[idx] = dx_shm[idx];
-  }
 }
 
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const LoDTensor& x, LoDTensor* out) {
-    auto x_dims = x.dims();
-    size_t element_len = framework::product(x_dims) / x_dims[0];
-    auto lod = out->lod().back();
-    framework::Vector<size_t> out_lod;
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      out_lod.push_back(lod[i + 1] - lod[i]);
-    }
-
-    int thread_x = std::max(static_cast<int>(element_len), 32);
-    int block_x = static_cast<int>(out_lod.size());
-    dim3 block_size(thread_x, 1024 / thread_x);
+  void operator()(
+      const platform::CUDADeviceContext& context, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out) {
+    int x_item_length = 1;
+    x_item_length = x.numel() / x.dims()[0];
+    VLOG(0) << "x_item_length" << x_item_length;
+    int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
+    int thread_y = std::max(1024 / thread_x, 16);
+    int thread_z = std::min(1024 / thread_x / thread_y, 16);
+    int block_x = static_cast<int>(ref_lod.size());
+    dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);
+
     sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), out->mutable_data<T>(context.GetPlace()),
-        out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
-        out_lod.size(), element_len, framework::product(x_dims));
+        x.data<T>(), x_lod.CUDAData(context.GetPlace()),
+        ref_lod.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
+        out->mutable_data<T>(context.GetPlace()));
   }
 };
 
 template <typename T>
 struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
-                  const LoDTensor& x, const LoDTensor& out,
-                  const LoDTensor& dout, LoDTensor* dx) {
-    auto x_dims = x.dims();
-    size_t element_len = framework::product(x_dims) / x_dims[0];
-    auto lod = out.lod().back();
-    framework::Vector<size_t> out_lod;
-    for (size_t i = 0; i < lod.size() - 1; ++i) {
-      out_lod.push_back(lod[i + 1] - lod[i]);
-    }
-    size_t dout_size = framework::product(dout.dims());
-    size_t dx_size = framework::product(dx->dims());
-
-    int thread_x = std::max(static_cast<int>(element_len), 32);
-    dim3 block_size(thread_x, 1024 / thread_x);
-    int block_x = static_cast<int>(out_lod.size());
+                  const LoDTensor& dout,
+                  const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+                  const framework::Vector<size_t>& ref_lod, /*expand based lod*/
+                  LoDTensor* dx) {
+    int x_item_length = 1;
+    x_item_length = framework::product(dx->dims()) / dx->dims()[0];
+
+    int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
+    int thread_y = std::max(1024 / thread_x, 16);
+    int thread_z = std::min(1024 / thread_x / thread_y, 16);
+    int block_x = static_cast<int>(ref_lod.size());
+    dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);
-    sequence_expand_grad_kernel<<<grid_size, block_size,
-                                  (dout_size + dx_size) * sizeof(T),
-                                  context.stream()>>>(
-        dout.data<T>(), dx->mutable_data<T>(context.GetPlace()),
-        out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()),
-        out_lod.size(), element_len, dout_size, dx_size);
+    sequence_expand_grad_kernel<<<grid_size, block_size, 0,
+                                  context.stream()>>>(
+        dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
+        x_lod.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
+        dx->mutable_data<T>(context.GetPlace()));
   }
 };

diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 5cab367988..c55c3e215a 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
 #pragma once
-#include <numeric>  // std::itoa
+#include <numeric>  // std::iota
+#include
+#include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -29,40 +31,42 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 struct SequenceExpandFunctor {
-  void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out);
+  void operator()(
+      const DeviceContext& ctx, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out);
 };
 
 template <typename DeviceContext, typename T>
 struct SequenceExpandGradFunctor {
-  void operator()(const DeviceContext& ctx, const LoDTensor& x,
-                  const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx);
+  void operator()(
+      const DeviceContext& ctx, const LoDTensor& dout,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* dx);
 };
 
 template <typename T>
 struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
-                  LoDTensor* out) {
-    auto& out_lod = out->lod()[0];
-    framework::Vector<size_t> x_lod;
-    if (x.lod() == 1) {
-      x_lod = x.lod()[0];
-    } else {
-      x_lod.reserve(out_lod.size());
-      std::itoa(x_lod.begin(), x_lod.end(), 0);  // fill 0 ~ out_lod.size()-1
-    }
+  void operator()(
+      const platform::CPUDeviceContext& context, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out) {
     int out_offset = 0;
     auto& eigen_place = *context.eigen_device();
-    for (size_t i = 1; i < out_lod.size(); ++i) {
-      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      int repeat_num = ref_lod[i] - ref_lod[i - 1];
       int x_start = x_lod[i - 1];
       int x_end = x_lod[i];
       int x_seq_len = x_end - x_start;
       if (repeat_num > 0) {
-        auto x_sub_tensor = x->Slice(x_start, x_end);
+        auto x_sub_tensor = x.Slice(x_start, x_end);
         x_sub_tensor.Resize({1, x_sub_tensor.numel()});
         int out_start = out_offset;
-        if (x_lod.size() == 1) {
-          out_start = out_lod[0][out_offset];
+        if (out->lod().size() == 1) {
+          out_start = out->lod()[0][out_offset];
         }
         auto out_sub_tensor =
             out->Slice(out_start, out_start + x_seq_len * repeat_num);
         out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
         EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
             EigenMatrix<T>::From(x_sub_tensor)
                 .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
       }
+      out_offset += repeat_num;
     }
   }
 };
 
@@ -96,13 +101,10 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
       return;
     }
 
     // x lod level is at most 1.
-    auto& out_lod = *out->mutable_lod();
-    if (x_lod.size() == 0) {
-      out_lod = y_lod[ref_level];
-    } else if (x_lod.size() == 1) {
-      out_lod.resize(1);
-      out_lod[0] = {0};
+    framework::Vector<size_t> out_lod;
+    if (x_lod.size() == 1) {
+      out_lod.push_back(0);
       int out_offset = 0;
       for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
         int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
         int x_start = x_lod[0][i - 1];
         int x_end = x_lod[0][i];
         int x_seq_len = x_end - x_start;
         for (int j = 0; j < repeat_num; ++j) {
-          out_lod[0].push_back(out_lod[0].back() + x_seq_len);
+          out_lod.push_back(out_lod.back() + x_seq_len);
           out_offset++;
         }
       }
+      // write lod to out if x has lod
+      auto& ref_lod = *out->mutable_lod();
+      ref_lod[0] = out_lod;
+    }
+    framework::Vector<size_t> ref_x_lod;
+    if (x->lod().size() == 1) {
+      ref_x_lod = x->lod()[0];
+    } else {
+      // x_lod doesn't has lod, use fake x lod, level = 0
+      ref_x_lod.resize(x->dims()[0] + 1);
+      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
     }
-
     SequenceExpandFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(), *x, out);
+    functor(context.template device_context<DeviceContext>(), *x, ref_x_lod,
+            y_lod[ref_level], out);
   }
 };
 
@@ -135,32 +148,29 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
  * */
 template <typename T>
 struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x,
-                  const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) {
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
-    set_zero(dev_ctx, g_x, static_cast<T>(0));
-
-    int g_out_offset = 0;
-    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+  void operator()(
+      const platform::CPUDeviceContext& context, const LoDTensor& dout,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* dx) {
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(context, dx, static_cast<T>(0));
+
+    int dout_offset = 0;
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      int repeat_num = ref_lod[i] - ref_lod[i - 1];
       if (repeat_num > 0) {
-        int x_start = i - 1;
-        int x_end = i;
-        if (x_lod.size() == 1) {
-          x_start = x_lod[0][i - 1];
-          x_end = x_lod[0][i];
-        }
+        int x_start = x_lod[i - 1];
+        int x_end = x_lod[i];
         int x_seq_len = x_end - x_start;
-        auto g_x_sub = g_x->Slice(x_start, x_end);
-        g_x_sub.Resize(flatten_to_1d(g_x_sub.dims()));
-        int g_out_end = g_out_offset + repeat_num * x_seq_len;
-        auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
-        g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]});
-        math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
-        col_sum(dev_ctx, g_out_sub, &g_x_sub);
-        g_out_offset += repeat_num * x_seq_len;
+        auto dx_sub = dx->Slice(x_start, x_end);
+        dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
+        int dout_end = dout_offset + repeat_num * x_seq_len;
+        auto dout_sub = dout.Slice(dout_offset, dout_end);
+        dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
+        math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
+        col_sum(context, dout_sub, &dx_sub);
+        dout_offset += repeat_num * x_seq_len;
       }
     }
   }
 };
 
@@ -179,20 +189,26 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
     g_x->mutable_data<T>(context.GetPlace());
     g_x->set_lod(x->lod());
 
-    auto& x_lod = x->lod();
     auto& y_lod = y->lod();
     if (ref_level == -1) ref_level = y_lod.size() - 1;
     // just copy the gradient
     if (y_lod[ref_level].size() <= 1) {
       framework::TensorCopy(*g_out, context.GetPlace(), g_x);
       return;
     }
 
+    framework::Vector<size_t> ref_x_lod;
+    framework::Vector<size_t> ref_lod = y_lod[ref_level];
+    if (x->lod().size() == 1) {
+      ref_x_lod = x->lod()[0];
+    } else {
+      // x_lod doesn't has lod, use fake x lod, level = 0
+      ref_x_lod.resize(x->dims()[0] + 1);
+      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
+    }
     SequenceExpandGradFunctor<DeviceContext, T> functor;
-    functor(context.template device_context<DeviceContext>(), *x, *y, *g_out,
-            g_x);
+    functor(context.template device_context<DeviceContext>(), *g_out, ref_x_lod,
+            ref_lod, g_x);
   }
 };

diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index d1cebc4ea2..4c8ec1426c 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -19,14 +19,8 @@ from op_test import OpTest
 
 class TestSequenceExpand(OpTest):
     def set_data(self):
-        x = [i / 10.0 for i in range(3)]
-        y = [i / 10.0 for i in range(8)]
-        x_data = np.array(x).reshape(3, 1).astype('float32')
-        y_data = np.array(y).reshape(8, 1).astype('float32')
-        print(x_data)
-        print(y_data)
-        # x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
-        # y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
+        x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
+        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
         y_lod = [[0, 1, 4, 8]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
 
@@ -45,8 +47,10 @@ class TestSequenceExpand(OpTest):
                 x_len = x_idx[i] - x_idx[i - 1]
                 if repeat_num > 0:
                     x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
-                    x_sub = np.repeat(x_sub, repeat_num, axis=0)
-                    out = np.vstack((out, x_sub))
+                    stacked_x_sub = x_sub
+                    for r in range(repeat_num - 1):
+                        stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
+                    out = np.vstack((out, stacked_x_sub))
                     if x_lod is not None:
                         for j in xrange(repeat_num):
                             out_lod[0].append(out_lod[0][-1] + x_len)
@@ -107,11 +103,11 @@ class TestSequenceExpandCase3(TestSequenceExpand):
 
 class TestSequenceExpandCase4(TestSequenceExpand):
     def set_data(self):
-        data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]
+        data = np.random.uniform(0.1, 1, [5 * 2, 1])
         x_data = np.array(data).reshape([5, 2]).astype('float32')
         x_lod = [[0, 2, 5]]
-        y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
-        y_lod = [[0, 1, 2], [0, 1, 2]]
+        y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
+        y_lod = [[0, 1, 3], [0, 1, 3]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}

From 0412f5e09b9c1d13593a28b6a529affb26681141 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Wed, 28 Mar 2018 05:21:29 -0700
Subject: [PATCH 07/67] "fix ci"

---
 paddle/fluid/operators/sequence_expand_op.cu | 21 ++++++++------------
 paddle/fluid/operators/sequence_expand_op.h  |  2 --
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 1bd7342652..8a35bc908e 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
 #define EIGEN_USE_GPU
-#include
 #include "paddle/fluid/operators/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_helper.h"
 
@@ -109,12 +108,10 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
-    int x_item_length = 1;
-    x_item_length = x.numel() / x.dims()[0];
-    VLOG(0) << "x_item_length" << x_item_length;
-    int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
-    int thread_y = std::max(1024 / thread_x, 16);
-    int thread_z = std::min(1024 / thread_x / thread_y, 16);
+    int x_item_length = x.numel() / x.dims()[0];
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
     int block_x = static_cast<int>(ref_lod.size());
     dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);
@@ -133,12 +130,10 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
      const framework::Vector<size_t>& ref_lod, /*expand based lod*/
       LoDTensor* dx) {
-    int x_item_length = 1;
-    x_item_length = framework::product(dx->dims()) / dx->dims()[0];
-
-    int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
-    int thread_y = std::max(1024 / thread_x, 16);
-    int thread_z = std::min(1024 / thread_x / thread_y, 16);
+    int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
     int block_x = static_cast<int>(ref_lod.size());
     dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);

diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index c55c3e215a..d62c387c3e 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -15,8 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include <numeric>  // std::iota
-#include
-#include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/math_function.h"

From b661fe1d76514127581f2f73b177d2891677d39f Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 29 Mar 2018 01:36:34 -0700
Subject: [PATCH 08/67] "fix ci"

---
 python/paddle/fluid/tests/unittests/op_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 555f188abb..8393f7827b 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -362,9 +362,6 @@ class OpTest(unittest.TestCase):
         for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
             abs_a[abs_a < 1e-3] = 1
-            print("actual", a)
-            print("*****")
-            print("expected", b)
 
             diff_mat = np.abs(a - b) / abs_a
             max_diff = np.max(diff_mat)

From fbdb5b7b437a55ce97fba37da5fdcbdd5e3e53bb Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 29 Mar 2018 19:20:50 -0700
Subject: [PATCH 09/67] "fix based on comment"

---
 paddle/fluid/operators/sequence_expand_op.cu | 68 +++++++++-----------
 1 file changed, 32 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 8a35bc908e..8119afce1a 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -25,27 +25,17 @@ using LoDTensor = framework::LoDTensor;
 template <typename T>
 __global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
                                        const size_t* ref_lod,
+                                       const size_t* offset,
                                        const size_t lod_size,
                                        /* default=1,
                                           the instance length*/
                                        const int x_item_length, T* out_data) {
-  constexpr int N = 1024;
-  __shared__ int mem[N];
-  int offset = 0;
-  for (int i = 0; i < lod_size; ++i) {
-    mem[i] = offset;
-    if (i < lod_size - 1) {
-      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
-    }
-  }
-  __syncthreads();
-
   int bid = blockIdx.x;
   if (bid >= lod_size - 1) return;
 
   int x_item_count = x_lod[bid + 1] - x_lod[bid];
   int repeats = ref_lod[bid + 1] - ref_lod[bid];
-  int out_offset = mem[bid];
+  int out_offset = static_cast<int>(offset[bid]);
   int x_offset = x_lod[bid];
   for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
     for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
       for (int tid_x = threadIdx.x; tid_x < x_item_length;
            tid_x += blockDim.x) {
         out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
                  tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
       }
     }
   }
 }
 
 template <typename T>
-__global__ void sequence_expand_grad_kernel(const T* dout_data,
-                                            const size_t* ref_lod,
-                                            const size_t* dx_lod,
-                                            const size_t lod_size,
-                                            /* default=1,
-                                               the instance length*/
-                                            const int x_item_length,
-                                            T* dx_data) {
-  // TODO(dzhwinter) : too many atomicAdd
-  // use shared memory to reduce memory visits
-  constexpr int N = 1024;
-  __shared__ int mem[N];
-  int offset = 0;
+__global__ void sequence_expand_grad_kernel(
+    const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
+    const size_t* offset, const size_t lod_size,
+    /* default=1,
+       the instance length*/
+    const int x_item_length, T* dx_data) {
   int bid = blockIdx.x;
   if (bid >= lod_size - 1) return;
   int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
   int repeats = ref_lod[bid + 1] - ref_lod[bid];
   int out_offset =
mem[bid]; + int out_offset = static_cast(offset[bid]); int x_offset = dx_lod[bid]; for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) { @@ -101,6 +76,19 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, } } +void GetOutputOffset(const framework::Vector& x_lod, + const framework::Vector& ref_lod, + framework::Vector& out_offset) { + size_t offset = 0; + int lod_size = static_cast(x_lod.size()); + for (int i = 0; i < static_cast(x_lod.size()); ++i) { + out_offset[i] = offset; + if (i < lod_size - 1) { + offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]); + } + } +} + template struct SequenceExpandFunctor { void operator()( @@ -109,6 +97,9 @@ struct SequenceExpandFunctor { const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* out) { int x_item_length = x.numel() / x.dims()[0]; + framework::Vector out_offset(x_lod.size()); + GetOutputOffset(x_lod, ref_lod, out_offset); + int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; int thread_z = 1024 / thread_x / thread_y; @@ -118,7 +109,8 @@ struct SequenceExpandFunctor { sequence_expand_kernel<<>>( x.data(), x_lod.CUDAData(context.GetPlace()), - ref_lod.CUDAData(context.GetPlace()), x_lod.size(), x_item_length, + ref_lod.CUDAData(context.GetPlace()), + out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length, out->mutable_data(context.GetPlace())); } }; @@ -131,6 +123,9 @@ struct SequenceExpandGradFunctor { const framework::Vector& ref_lod, /*expand based lod*/ LoDTensor* dx) { int x_item_length = framework::product(dx->dims()) / dx->dims()[0]; + framework::Vector out_offset(x_lod.size()); + GetOutputOffset(x_lod, ref_lod, out_offset); + int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; int thread_z = 1024 / thread_x / thread_y; @@ -139,7 +134,8 @@ struct SequenceExpandGradFunctor { dim3 grid_size(block_x, 1); sequence_expand_grad_kernel<<>>( dout.data(), ref_lod.CUDAData(context.GetPlace()), - x_lod.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length, + x_lod.CUDAData(context.GetPlace()), + out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length, dx->mutable_data(context.GetPlace())); } }; From f43be75b82582ec5f81c2ceba45eb14128638478 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 2 Apr 2018 20:25:11 +0800 Subject: [PATCH 10/67] multi stream thread pool --- paddle/fluid/framework/threadpool.cc | 15 +++++++++++++++ paddle/fluid/framework/threadpool.h | 16 ++++++++++++++++ paddle/fluid/operators/detail/grpc_client.cc | 12 +++++++----- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 9854d618d2..0a8377cc47 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -91,5 +91,20 @@ void ThreadPool::TaskLoop() { } } +std::unique_ptr MultiStreamThreadPool::io_threadpool_(nullptr); +std::once_flag MultiStreamThreadPool::io_init_flag_; + +MultiStreamThreadPool* MultiStreamThreadPool::GetInstanceIO() { + std::call_once(io_init_flag_, &MultiStreamThreadPool::InitIO); + return static_cast(io_threadpool_.get()); +} + +void MultiStreamThreadPool::InitIO() { + if (io_threadpool_.get() == nullptr) { + // TODO(typhoonzero1986): make this configurable + io_threadpool_.reset(new ThreadPool(100)); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 
f9dce7105e..5d437594ab 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -135,6 +135,17 @@ class ThreadPool { std::condition_variable completed_; }; +class MultiStreamThreadPool : ThreadPool { + public: + static MultiStreamThreadPool* GetInstanceIO(); + static void InitIO(); + + private: + // NOTE: threadpool in base will be inhereted here. + static std::unique_ptr io_threadpool_; + static std::once_flag io_init_flag_; +}; + // Run a function asynchronously. // NOTE: The function must return void. If the function need to return a value, // you can use lambda to capture a value pointer. @@ -143,5 +154,10 @@ std::future Async(Callback callback) { return ThreadPool::GetInstance()->Run(callback); } +template +std::future AsyncIO(Callback callback) { + return MultiStreamThreadPool::GetInstanceIO()->Run(callback); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index d79ba6d291..3f96ce3718 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -33,7 +33,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, + this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; @@ -88,7 +89,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, + this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -131,8 +133,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -195,7 +197,7 @@ bool RPCClient::Wait() { std::vector> waits(req_count_); for (int i = 0; i < req_count_; i++) { - waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); }); } for (int i = 0; i < req_count_; i++) { From b851c0739f29eebfb9d63db026c847733fa8d252 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 3 Apr 2018 10:02:34 +0800 Subject: [PATCH 11/67] update compile --- paddle/fluid/framework/threadpool.h | 32 ++++++++++---------- paddle/fluid/operators/detail/grpc_client.cc | 12 +++----- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 5d437594ab..0a60488d9f 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -28,6 +28,22 @@ limitations under the License. 
*/

 namespace paddle {
 namespace framework {

+struct ExceptionHandler {
+  mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+  explicit ExceptionHandler(
+      std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+      : future_(std::move(f)) {}
+  void operator()() const {
+    auto ex = this->future_.get();
+    if (ex != nullptr) {
+      LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                    "should use RunAndGetException to handle the exception.\n"
+                    "The default exception handler is LOG(FATAL)."
+                 << ex->what();
+    }
+  }
+};
+
 // ThreadPool maintains a queue of tasks, and runs them using a fixed
 // number of threads.
 class ThreadPool {
@@ -87,22 +103,6 @@ class ThreadPool {
   void Wait();

  private:
-  struct ExceptionHandler {
-    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
-    explicit ExceptionHandler(
-        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
-        : future_(std::move(f)) {}
-    void operator()() const {
-      auto ex = this->future_.get();
-      if (ex != nullptr) {
-        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
-                      "should use RunAndGetException to handle the exception.\n"
-                      "The default exception handler is LOG(FATAL)."
-                   << ex->what();
-      }
-    }
-  };
-
   DISABLE_COPY_AND_ASSIGN(ThreadPool);

   // If the task queue is empty and available is equal to the number of
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 3f96ce3718..d79ba6d291 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -33,8 +33,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);

-  framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
-                      this] {
+  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
     auto* var = p_scope->FindVar(var_name_val);

     ::grpc::ByteBuffer req;
@@ -89,8 +88,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);

-  framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
-                      this] {
+  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -133,8 +131,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);

-  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, ch, this] {
+  framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
+                    time_out, ch, this] {
     auto* var = p_scope->FindVar(in_var_name_val);

     ::grpc::ByteBuffer req;
@@ -197,7 +195,7 @@ bool RPCClient::Wait() {
   std::vector<std::future<void>> waits(req_count_);

   for (int i = 0; i < req_count_; i++) {
-    waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
+    waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
   }

   for (int i = 0; i < req_count_; i++) {

From c72450d24d49c547d8e6bfc75691f429c19d6a79 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 2 Apr 2018 23:46:34 -0700
Subject: [PATCH 12/67] "separate test"

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0ad273c716..3873fda226 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -28,6 +28,8 @@ function(py_test_modules TARGET_NAME)
   endif()
 endfunction()
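+# test_sequence_expand is excluded from the parallel TEST_OPS batch here and
+# registered as its own py_test_modules target further down in this file.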
+list(REMOVE_ITEM TEST_OPS test_sequence_expand)
+
 # test time consuming OPs in a separate process for exploiting parallelism
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
@@ -63,6 +65,8 @@ else()
     endforeach(TEST_OP)
 endif(WITH_FAST_BUNDLE_TEST)

+#
+py_test_modules(test_sequence_expand MODULES test_sequence_expand)
 # tests with high overhead
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
 py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)

From fbdb5604cad8fdb3ad7fa2f6717395b1c40e6ecaf Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Tue, 3 Apr 2018 05:31:52 +0000
Subject: [PATCH 13/67] Split Executor.Run to Executor.Prepare and
 Executor.RunPreparedContext for inference.

---
 paddle/fluid/framework/executor.cc                 | 94 ++++++++++++-------
 paddle/fluid/framework/executor.h                  |  7 ++
 .../test_inference_image_classification.cc         |  4 +-
 paddle/fluid/inference/tests/test_helper.h         | 20 +++-
 4 files changed, 85 insertions(+), 40 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 64c06687b6..009d0fbeb8 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -129,13 +129,15 @@ static bool has_feed_operators(
       feed_count, feed_targets.size(),
       "The number of feed operators should match 'feed_targets'");

-    // When feed operator are present, so should be feed_holder
-    auto var = block.FindVar(feed_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            feed_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
-                      "'%s' variable should be 'FEED_MINIBATCH' type",
-                      feed_holder_name);
+    if (!feed_holder_name.empty()) {
+      // When feed operators are present, so should be feed_holder
+      auto var = block.FindVar(feed_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              feed_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
+                        "'%s' variable should be 'FEED_MINIBATCH' type",
+                        feed_holder_name);
+    }
   }

   return feed_count > 0;
@@ -169,13 +171,15 @@ static bool has_fetch_operators(
       fetch_count, fetch_targets.size(),
       "The number of fetch operators should match 'fetch_targets'");

-    // When fetch operator are present, so should be fetch_holder
-    auto var = block.FindVar(fetch_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            fetch_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
-                      "'%s' variable should be 'FETCH_LIST' type",
-                      fetch_holder_name);
+    if (!fetch_holder_name.empty()) {
+      // When fetch operators are present, so should be fetch_holder
+      auto var = block.FindVar(fetch_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              fetch_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
+                        "'%s' variable should be 'FETCH_LIST' type",
+                        fetch_holder_name);
+    }
   }

   return fetch_count > 0;
@@ -222,16 +226,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }

-  // map the data of feed_targets to feed_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
-    }
-  }
-
   if (!has_fetch_ops) {
     // create fetch_holder variable
     auto* fetch_holder = global_block->Var(fetch_holder_name);
@@
-255,17 +249,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } } - Run(*copy_program, scope, 0, create_vars, create_vars); - - // obtain the data of fetch_targets from fetch_holder - for (auto* op : global_block->AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - int idx = boost::get(op->GetAttr("col")); - *fetch_targets[fetch_target_name] = - GetFetchVariable(*scope, fetch_holder_name, idx); - } - } + auto ctx = Prepare(*copy_program, 0); + RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, + feed_holder_name, fetch_holder_name, create_vars); } std::unique_ptr Executor::Prepare( @@ -343,5 +329,43 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } +void Executor::RunPreparedContext( + ExecutorPrepareContext* ctx, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name, const std::string& fetch_holder_name, + bool create_vars) { + auto& global_block = ctx->prog_.Block(ctx->block_id_); + + // map the data of feed_targets to feed_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + PADDLE_ENFORCE(feed_targets.find(feed_target_name) != feed_targets.end(), + "Variable %s is not feeded."); + + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, + idx); + } + } + + RunPreparedContext(ctx, scope, create_vars, create_vars); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + PADDLE_ENFORCE( + fetch_targets.find(fetch_target_name) != fetch_targets.end(), + "Variable %s is not fetched."); + + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 7173c51c95..b0e64d5de0 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -65,6 +65,13 @@ class Executor { bool create_local_scope = true, bool create_vars = true); + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch", + bool create_vars = true); + private: const platform::Place place_; }; diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index e9a27171f1..9126efb8c2 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -48,7 +48,7 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); @@ -59,7 +59,7 @@ TEST(inference, image_classification) { // Run inference on CUDA GPU LOG(INFO) << "--- GPU Runs: ---"; - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); diff --git a/paddle/fluid/inference/tests/test_helper.h 
b/paddle/fluid/inference/tests/test_helper.h
index dce541c097..d559cc7d03 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -88,7 +88,7 @@ void CheckError(paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }

-template <typename Place, bool CreateVars = true>
+template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                    std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
@@ -170,7 +170,14 @@ void TestInference(const std::string& dirname,
   // 6. Run the inference program
   {
     // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    if (PrepareContext) {
+      ctx = executor.Prepare(*inference_program, 0);
+      executor.RunPreparedContext(
+          ctx.get(), scope, feed_targets, fetch_targets);
+    } else {
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    }

     // Enable the profiler
     paddle::platform::EnableProfiler(state);
@@ -181,7 +188,14 @@ void TestInference(const std::string& dirname,
         "run_inference",
         paddle::platform::DeviceContextPool::Instance().Get(place));

-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    if (PrepareContext) {
+      // Note: if you change the inference_program, you need to call
+      // executor.Prepare() again to get a new ExecutorPrepareContext.
+      executor.RunPreparedContext(
+          ctx.get(), scope, feed_targets, fetch_targets);
+    } else {
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    }
   }

   // Disable the profiler and print the timing information

From a9e826ed495bcd5a5b625d4ce364c8c42d0d0b7d Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Sun, 8 Apr 2018 06:32:30 +0000
Subject: [PATCH 14/67] Add the check of has_feed/fetch_operators back.
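The feed/fetch-aware RunPreparedContext path being hardened here is used as
follows. A minimal sketch of the calling pattern, assuming a program whose
feed/fetch ops are already inserted (error handling omitted):

  paddle::framework::Executor executor(place);
  // Prepare once: builds the op list for block 0 of the program.
  auto ctx = executor.Prepare(*inference_program, 0);
  // Run repeatedly with the same context; feed_targets/fetch_targets map
  // variable names to LoDTensor pointers.
  executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets);
  // Only if inference_program itself changes does Prepare() need to be
  // called again to obtain a fresh ExecutorPrepareContext.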
--- paddle/fluid/framework/executor.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8a0ab118d0..3edaede8d6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -352,13 +352,17 @@ void Executor::RunPreparedContext( bool create_vars) { auto& global_block = ctx->prog_.Block(ctx->block_id_); + PADDLE_ENFORCE( + has_feed_operators(global_block, feed_targets, feed_holder_name), + "Program in ExecutorPrepareContext should has feed_ops."); + PADDLE_ENFORCE( + has_fetch_operators(global_block, fetch_targets, fetch_holder_name), + "Program in the prepared context should has fetch_ops."); + // map the data of feed_targets to feed_holder for (auto* op : global_block.AllOps()) { if (op->Type() == kFeedOpType) { std::string feed_target_name = op->Output("Out")[0]; - PADDLE_ENFORCE(feed_targets.find(feed_target_name) != feed_targets.end(), - "Variable %s is not feeded."); - int idx = boost::get(op->GetAttr("col")); SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, idx); @@ -371,10 +375,6 @@ void Executor::RunPreparedContext( for (auto* op : global_block.AllOps()) { if (op->Type() == kFetchOpType) { std::string fetch_target_name = op->Input("X")[0]; - PADDLE_ENFORCE( - fetch_targets.find(fetch_target_name) != fetch_targets.end(), - "Variable %s is not fetched."); - int idx = boost::get(op->GetAttr("col")); *fetch_targets[fetch_target_name] = GetFetchVariable(*scope, fetch_holder_name, idx); From 972ae6e98ffbddac7b68242f946934b07b275e01 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 9 Apr 2018 14:27:19 +0800 Subject: [PATCH 15/67] random selected rows value --- paddle/fluid/operators/uniform_random_op.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 87699362b2..a50add9739 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -24,7 +24,15 @@ template class CPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* tensor = ctx.Output("Out"); + framework::Tensor* tensor(nullptr); + auto out_var = ctx.OutputVar("Out"); + if (out_var->IsType()) { + tensor = ctx.Output("Out"); + } else if (out_var->IsType()) { + tensor = ctx.Output("Out")->mutable_value(); + } else { + PADDLE_THROW("Only support LoDTensor and SelectedRows."); + } T* data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; @@ -36,6 +44,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { static_cast(ctx.Attr("min")), static_cast(ctx.Attr("max"))); int64_t size = tensor->numel(); + VLOG(3) << "size = " << size; for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } @@ -55,6 +64,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { "uniform_random's min must less then max"); auto& shape = ctx->Attrs().Get>("shape"); std::vector temp; + VLOG(3) << "shape.size() = " << shape.size(); temp.reserve(shape.size()); for (auto dim : shape) { temp.push_back(static_cast(dim)); From f909ff1a3652697f63070cf1bc8cb425d1902417 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 9 Apr 2018 15:53:00 +0800 Subject: [PATCH 16/67] update unit test --- paddle/fluid/operators/uniform_random_op.cc | 5 +- 
paddle/fluid/operators/uniform_random_op.cu | 13 +++++- .../tests/unittests/test_uniform_random_op.py | 46 +++++++++++++++++-- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index a50add9739..d8b38fb7eb 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -29,11 +29,14 @@ class CPUUniformRandomKernel : public framework::OpKernel { if (out_var->IsType()) { tensor = ctx.Output("Out"); } else if (out_var->IsType()) { + auto shape = ctx.Attr>("shape"); tensor = ctx.Output("Out")->mutable_value(); + tensor->Resize(framework::make_ddim(shape)); } else { PADDLE_THROW("Only support LoDTensor and SelectedRows."); } T* data = tensor->mutable_data(ctx.GetPlace()); + data[0] = static_cast(1000); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; if (seed == 0) { @@ -44,7 +47,6 @@ class CPUUniformRandomKernel : public framework::OpKernel { static_cast(ctx.Attr("min")), static_cast(ctx.Attr("max"))); int64_t size = tensor->numel(); - VLOG(3) << "size = " << size; for (int64_t i = 0; i < size; ++i) { data[i] = dist(engine); } @@ -64,7 +66,6 @@ class UniformRandomOp : public framework::OperatorWithKernel { "uniform_random's min must less then max"); auto& shape = ctx->Attrs().Get>("shape"); std::vector temp; - VLOG(3) << "shape.size() = " << shape.size(); temp.reserve(shape.size()); for (auto dim : shape) { temp.push_back(static_cast(dim)); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 1232cd1eb3..115c859527 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -43,7 +43,18 @@ template class GPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); + framework::Tensor* tensor(nullptr); + auto out_var = ctx.OutputVar("Out"); + if (out_var->IsType()) { + tensor = ctx.Output("Out"); + } else if (out_var->IsType()) { + auto shape = ctx.Attr>("shape"); + tensor = ctx.Output("Out")->mutable_value(); + tensor->Resize(framework::make_ddim(shape)); + } else { + PADDLE_THROW("Only support LoDTensor and SelectedRows."); + } + T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); if (seed == 0) { diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 75ff85a55f..3331e99c36 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -15,6 +15,16 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def output_hist(out): + hist, _ = np.histogram(out, range=(-5, 10)) + hist = hist.astype("float32") + hist /= float(out.size) + prob = 0.1 * np.ones((10)) + return hist, prob class TestUniformRandomOp(OpTest): @@ -33,11 +43,37 @@ class TestUniformRandomOp(OpTest): self.check_output_customized(self.verify_output) def verify_output(self, outs): - tensor = outs[0] - hist, _ = np.histogram(outs[0], range=(-5, 10)) - hist = hist.astype("float32") - hist /= float(outs[0].size) - prob = 0.1 * np.ones((10)) + hist, prob = output_hist(outs[0]) + self.assertTrue( + np.allclose( + hist, prob, rtol=0, 
atol=0.01), "hist: " + str(hist)) + + +class TestUniformRandomOpSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + out = scope.var("X").get_selected_rows() + + op = Operator( + "uniform_random", + Out="X", + shape=[1000, 784], + min=-5.0, + max=10.0, + seed=10) + op.run(scope, place) + out_tensor = out.get_tensor() + hist, prob = output_hist(np.array(out_tensor)) self.assertTrue( np.allclose( hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) From 9fe938cb2aefcbced1e60fa459c943fa2ea245e6 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 03:48:26 +0000 Subject: [PATCH 17/67] Changing network configuration, avoid nan --- .../fluid/tests/book/test_label_semantic_roles.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index c0a6df831a..5fc64ea958 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -77,7 +77,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -94,8 +94,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') ]) lstm = fluid.layers.dynamic_lstm( @@ -109,8 +109,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, input_tmp = [mix_hidden, lstm] feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') ]) return feature_out @@ -171,7 +171,7 @@ def train(use_cuda, save_dirname=None, is_local=True): # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( - learning_rate=0.0001, + learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True)) From d9a52223852a92d532ff2522cb648758511abe26 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 04:57:30 +0000 Subject: [PATCH 18/67] code style --- .../tests/book/test_label_semantic_roles.py | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 5fc64ea958..4f5d30ac00 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,14 +70,15 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, 
ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input + param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) + for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -163,8 +164,7 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,8 +189,7 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -223,24 +222,25 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + - str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str( - pass_recall) + " pass_f1_score:" + str( - pass_f1_score)) + print( + "avg_cost:" + str(cost) + " precision:" + + str(precision) + " recall:" + str(recall) + + " f1_score:" + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str(pass_recall) + + " pass_f1_score:" + str(pass_f1_score)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str( + (time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model(save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model( + save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,19 +320,20 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run(inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run( + inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) 
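 # With return_numpy=False the fetched results stay LoDTensor objects rather
 # than numpy arrays, so the sequence information is still available via
 # lod(), as printed below.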
print(results[0].lod()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From 3f6fc10b9fc6da75961bab0f7a473dc388d07f51 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 10 Apr 2018 14:23:09 +0800 Subject: [PATCH 19/67] new op that init table value randomly --- .../operators/uniform_random_table_op.cc | 144 ++++++++++++++++++ .../unittests/test_uniform_random_table_op.py | 66 ++++++++ 2 files changed, 210 insertions(+) create mode 100644 paddle/fluid/operators/uniform_random_table_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py diff --git a/paddle/fluid/operators/uniform_random_table_op.cc b/paddle/fluid/operators/uniform_random_table_op.cc new file mode 100644 index 0000000000..4664cc5d93 --- /dev/null +++ b/paddle/fluid/operators/uniform_random_table_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class UniformRandomTableInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + VLOG(3) << "Infershape..."; + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UniformRandomTableOp should not be null."); + + PADDLE_ENFORCE( + ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), + "uniform_random's min must less then max"); + auto &shape = ctx->Attrs().Get>("shape"); + std::vector temp; + temp.reserve(shape.size()); + for (auto dim : shape) { + temp.push_back(static_cast(dim)); + } + ctx->SetOutputDim("Out", framework::make_ddim(temp)); + } +}; + +class UniformRandomTableOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + VLOG(3) << "RunImpl..."; + auto out = + scope.FindVar(Output("Out"))->GetMutable(); + auto shard_cnt = Attr("shard_cnt"); + auto shard_id = Attr("shard_id"); + auto max_id = Attr("max_id"); + auto shape = Attr>("shape"); + + auto tensor = out->mutable_value(); + tensor->Resize(framework::make_ddim(shape)); + // Only allocate the memory of large table on CPU + auto cpu = platform::CPUPlace(); + float *data = tensor->mutable_data(cpu); + VLOG(3) << "generate seed"; + unsigned int seed = static_cast(Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist(Attr("min"), + Attr("max")); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + // initialize rows by round-robin + // TODO(Yancey1989): need to support other way to distribute Ids + VLOG(3) << "calculate rows_size..."; + int64_t rows_size = 0; + if (max_id % 
shard_cnt == 0) { + rows_size = max_id / shard_cnt; + } else { + rows_size = max_id / shard_cnt + 1; + } + auto *rows = out->mutable_rows(); + rows->resize(rows_size); + (*rows)[0] = shard_id; + for (int64_t idx = 1; idx < rows_size; ++idx) { + (*rows)[idx] = (*rows)[idx - 1] + shard_cnt; + } + out->set_height(max_id); + } +}; + +class UniformRandomTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UniformRandomTableOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", + "(SelectedRows)" + "The output table of uniform random table op."); + AddComment(R"DOC( +Uniform random operator for initializing a table. + +This operator initializes a SelectedRows with random values sampled from a +uniform distribution. + +)DOC"); + AddAttr("max_id", + "(int, required)" + "The maximal Id for the table."); + AddAttr("shard_cnt", + "(int, required)" + "The count of shards for distributing the table."); + AddAttr("shard_id", "(int, required) The current shard ID."); + AddAttr>("shape", + "(vector) The shape of the output tensor"); + AddAttr("min", + "(float, default -1.0) " + "Minimum value of uniform random") + .SetDefault(-1.0f); + AddAttr("max", + "(float, default 1.0) " + "Maximun value of uniform random") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed used for generating samples. " + "0 means use a seed generated by the system." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time.") + .SetDefault(0); + AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") + .SetDefault(framework::proto::VarType::FP32); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(uniform_random_table, ops::UniformRandomTableOp, + ops::UniformRandomTableInferShape, + ops::UniformRandomTableOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py new file mode 100644 index 0000000000..0474c51e49 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py @@ -0,0 +1,66 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
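+#
+# The test below drives the operator directly through Operator and verifies
+# both the round-robin row split across shards and the uniform value
+# histogram of the initialized table.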
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def output_hist(out): + hist, _ = np.histogram(out, range=(-5, 10)) + hist = hist.astype("float32") + hist /= float(out.size) + prob = 0.1 * np.ones((10)) + return hist, prob + + +class TestUniformRandomTableOp(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + out = scope.var("X").get_selected_rows() + + op = Operator( + "uniform_random_table", + Out="X", + shape=[4, 784], + min=-5.0, + max=10.0, + seed=10, + shard_cnt=3, + shard_id=1, + max_id=10) + op.run(scope, place) + self.assertEqual(out.rows(), [1, 4, 7, 10]) + self.assertEqual(out.height(), 10) + self.assertEqual(out.get_tensor().shape(), [4, 784]) + hist, prob = output_hist(np.array(out.get_tensor())) + self.assertTrue( + np.allclose( + hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + + +if __name__ == "__main__": + unittest.main() From cb7bbf426c1be2d4a0989855f6440b0b8313f6b0 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 10 Apr 2018 14:28:35 +0800 Subject: [PATCH 20/67] revert uniform_random_op --- paddle/fluid/operators/uniform_random_op.cc | 13 +----- paddle/fluid/operators/uniform_random_op.cu | 13 +----- .../tests/unittests/test_uniform_random_op.py | 46 ++----------------- 3 files changed, 7 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index d8b38fb7eb..87699362b2 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -24,19 +24,8 @@ template class CPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* tensor(nullptr); - auto out_var = ctx.OutputVar("Out"); - if (out_var->IsType()) { - tensor = ctx.Output("Out"); - } else if (out_var->IsType()) { - auto shape = ctx.Attr>("shape"); - tensor = ctx.Output("Out")->mutable_value(); - tensor->Resize(framework::make_ddim(shape)); - } else { - PADDLE_THROW("Only support LoDTensor and SelectedRows."); - } + auto* tensor = ctx.Output("Out"); T* data = tensor->mutable_data(ctx.GetPlace()); - data[0] = static_cast(1000); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; if (seed == 0) { diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 115c859527..1232cd1eb3 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -43,18 +43,7 @@ template class GPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - framework::Tensor* tensor(nullptr); - auto out_var = ctx.OutputVar("Out"); - if (out_var->IsType()) { - tensor = ctx.Output("Out"); - } else if (out_var->IsType()) { - auto shape = ctx.Attr>("shape"); - tensor = ctx.Output("Out")->mutable_value(); - tensor->Resize(framework::make_ddim(shape)); - } else { - PADDLE_THROW("Only support LoDTensor and SelectedRows."); - } - + auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); if 
(seed == 0) { diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 3331e99c36..75ff85a55f 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -15,16 +15,6 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid.core as core -from paddle.fluid.op import Operator - - -def output_hist(out): - hist, _ = np.histogram(out, range=(-5, 10)) - hist = hist.astype("float32") - hist /= float(out.size) - prob = 0.1 * np.ones((10)) - return hist, prob class TestUniformRandomOp(OpTest): @@ -43,37 +33,11 @@ class TestUniformRandomOp(OpTest): self.check_output_customized(self.verify_output) def verify_output(self, outs): - hist, prob = output_hist(outs[0]) - self.assertTrue( - np.allclose( - hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) - - -class TestUniformRandomOpSelectedRows(unittest.TestCase): - def get_places(self): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - return places - - def test_check_output(self): - for place in self.get_places(): - self.check_with_place(place) - - def check_with_place(self, place): - scope = core.Scope() - out = scope.var("X").get_selected_rows() - - op = Operator( - "uniform_random", - Out="X", - shape=[1000, 784], - min=-5.0, - max=10.0, - seed=10) - op.run(scope, place) - out_tensor = out.get_tensor() - hist, prob = output_hist(np.array(out_tensor)) + tensor = outs[0] + hist, _ = np.histogram(outs[0], range=(-5, 10)) + hist = hist.astype("float32") + hist /= float(outs[0].size) + prob = 0.1 * np.ones((10)) self.assertTrue( np.allclose( hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) From 7c1434dd73d367932e98ae569093183d33b7e5fb Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 07:36:15 +0000 Subject: [PATCH 21/67] code style --- .../tests/book/test_label_semantic_roles.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 4f5d30ac00..ace2e39ba4 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,8 +70,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) - for x in word_input + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -164,7 +164,8 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,7 +190,8 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ 
-222,25 +224,24 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 10 == 0: - print( - "avg_cost:" + str(cost) + " precision:" + - str(precision) + " recall:" + str(recall) + - " f1_score:" + str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str(pass_recall) - + " pass_f1_score:" + str(pass_f1_score)) + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) if batch_id != 0: - print("second per batch: " + str( - (time.time() - start_time) / batch_id)) + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model( - save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,20 +321,19 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run( - inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) print(results[0].lod()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From ad6ddf533cfb1542283f741cddb78835fb3b8658 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 09:23:11 +0000 Subject: [PATCH 22/67] for ci --- python/paddle/fluid/tests/book/test_label_semantic_roles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index ace2e39ba4..4d8bca4d24 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -37,7 +37,7 @@ depth = 8 mix_hidden_lr = 1e-3 IS_SPARSE = True -PASS_NUM = 10 +PASS_NUM = 100 BATCH_SIZE = 10 embedding_name = 'emb' @@ -234,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True): print("second per batch: " + str((time.time( ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test - if float(pass_precision) > 0.05: + if float(pass_precision) > 0.01: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ From 8eaec5dd7c5d627aa2d23db1fc518a1e85a30821 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 9 Apr 2018 15:28:07 +0800 
Subject: [PATCH 23/67] add BCast and Gather

---
 paddle/fluid/framework/details/CMakeLists.txt |   8 +-
 .../framework/details/broad_cast_op_handle.cc | 103 +++++++++++
 .../framework/details/broad_cast_op_handle.h  |  54 ++++++
 .../details/broad_cast_op_handle_test.cc      | 174 ++++++++++++++++++
 paddle/fluid/platform/device_context.h        |  46 ++++-
 5 files changed, 382 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/framework/details/broad_cast_op_handle.cc
 create mode 100644 paddle/fluid/framework/details/broad_cast_op_handle.h
 create mode 100644 paddle/fluid/framework/details/broad_cast_op_handle_test.cc

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 89b5c6847f..eda2b6aac0 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,12 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-    dynload_cuda)
+if(WITH_GPU)
+  nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+      dynload_cuda)
+  nv_library(broad_cast_op_handle SRCS broad_cast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+endif()
+
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)

 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
@@ -11,6 +15,8 @@ cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)

 if(WITH_GPU)
   set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+  nv_test(broad_cast_op_test SRCS broad_cast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
+      device_context broad_cast_op_handle)
 else()
   set(multi_devices_graph_builder_deps)
 endif()
diff --git a/paddle/fluid/framework/details/broad_cast_op_handle.cc b/paddle/fluid/framework/details/broad_cast_op_handle.cc
new file mode 100644
index 0000000000..e636371b94
--- /dev/null
+++ b/paddle/fluid/framework/details/broad_cast_op_handle.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
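+//
+// BCastOpHandle (defined below) takes the single input variable, which lives
+// in one of the local scopes, and copies it into the matching variable of
+// every place in places_.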
+ +#include "paddle/fluid/framework/details/broad_cast_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +Tensor *GetTensorFromVar(Variable *in_var) { + if (in_var->IsType()) { + return in_var->GetMutable(); + } else if (in_var->IsType()) { + return in_var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } + return nullptr; +} +BCastOpHandle::BCastOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::ContextMap &ctxs) + : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) { + for (auto &p : places_) { + this->dev_ctxes_[p] = ctxs_.DevCtx(p); + } +} + +void BCastOpHandle::RunImpl() { + PADDLE_ENFORCE_EQ(this->inputs_.size(), 1); + PADDLE_ENFORCE_EQ(this->outputs_.size(), places_.size()); + + // Wait input done, this Wait is asynchronous operation + auto in_var_handle = static_cast(this->inputs_[0]); + auto &in_place = in_var_handle->place_; + if (inputs_[0]->generated_op_) + inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]); + + auto iter = std::find(places_.begin(), places_.end(), in_place); + if (iter == places_.end()) { + PADDLE_THROW("The input of BCast is not in the places_."); + } + + int offset = iter - places_.begin(); + auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_); + + Tensor *in_tensor = GetTensorFromVar(in_var); + for (auto *out : outputs_) { + auto out_handle = static_cast(out); + auto &out_p = out_handle->place_; + + auto iter = std::find(places_.begin(), places_.end(), out_p); + if (iter == places_.end()) { + PADDLE_THROW("The output of BCast is not in the places_."); + } + int offset = iter - places_.begin(); + + auto *s = local_scopes_[offset]; + auto out_var = s->FindVar(out_handle->name_); + + PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), ""); + + if (in_var->IsType()) { + auto in_sr = in_var->GetMutable(); + auto out = out_var->GetMutable(); + if (in_sr == out) continue; + out->set_height(in_sr->height()); + out->set_rows(in_sr->rows()); + out->mutable_value()->Resize(in_sr->value().dims()); + out->mutable_value()->mutable_data(out_p, in_sr->value().type()); + } else if (in_var->IsType()) { + auto in_lod = in_var->GetMutable(); + auto out = out_var->GetMutable(); + if (in_lod == out) continue; + out->set_lod(in_lod->lod()); + out->Resize(in_lod->dims()); + out->mutable_data(out_p, in_lod->type()); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } + + Tensor *out_tensor = GetTensorFromVar(out_var); + + paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]), + out_tensor); + } +} + +std::string BCastOpHandle::Name() const { return "broadcast"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/broad_cast_op_handle.h b/paddle/fluid/framework/details/broad_cast_op_handle.h new file mode 100644 index 0000000000..432e86e410 --- /dev/null +++ b/paddle/fluid/framework/details/broad_cast_op_handle.h @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+/*
+ * BroadCast the input to all scope.
+ *
+ */
+struct BCastOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::ContextMap &ctxs_;
+
+  BCastOpHandle(const std::vector<Scope *> &local_scopes,
+                const std::vector<platform::Place> &places,
+                const platform::ContextMap &ctxs);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broad_cast_op_handle_test.cc b/paddle/fluid/framework/details/broad_cast_op_handle_test.cc
new file mode 100644
index 0000000000..a1338abeb5
--- /dev/null
+++ b/paddle/fluid/framework/details/broad_cast_op_handle_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
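The test file below drives the handle through one fixed rhythm: seed an input tensor in a single scope, call Run, wait on every device context, copy each output back to a CPUPlace, and compare element-wise with ASSERT_NEAR. A host-only sketch of that check loop, assuming plain vectors in place of device tensors:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Broadcast-and-verify skeleton: copy src into every destination "scope",
// then compare each destination element-wise against the original.
void BroadcastAndCheck(const std::vector<float>& src,
                       std::vector<std::vector<float>>* dsts) {
  for (auto& d : *dsts) d = src;  // stands in for Run(false) + WaitAll()
  for (const auto& d : *dsts) {
    assert(d.size() == src.size());
    for (std::size_t i = 0; i < src.size(); ++i)
      assert(std::fabs(d[i] - src[i]) < 1e-5);  // mirrors ASSERT_NEAR
  }
}

The real test has to route every comparison through a CPU-side staging tensor (a TensorCopy back to cpu_place) because the outputs may live on different GPUs; that staging copy is the only device-specific part of the loop.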
+ +#include "paddle/fluid/framework/details/broad_cast_op_handle.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +class BroadCastTester : public ::testing::Test { + public: + void SetUp() override { + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu BroadCast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + gpu_list_.emplace_back(p::CUDAPlace(i)); + } + ctxs_ = new p::ContextMap(gpu_list_); + } + + template + void BroadCastInitOp(int gpu_id = 0) { + for (size_t j = 0; j < gpu_list_.size(); ++j) { + local_scope_.push_back(&g_scope_.NewScope()); + auto* out_var = local_scope_[j]->Var("out"); + out_var->GetMutable(); + } + auto* in_var = local_scope_[gpu_id]->Var("input"); + in_var->GetMutable(); + + bc_op_handle_ = + new f::details::BCastOpHandle(local_scope_, gpu_list_, *ctxs_); + + f::details::VarHandle* in_var_handle = new f::details::VarHandle(); + in_var_handle->place_ = gpu_list_[gpu_id]; + in_var_handle->name_ = "input"; + in_var_handle->version_ = 1; + in_var_handle->generated_op_ = nullptr; + bc_op_handle_->AddInput(in_var_handle); + + for (size_t j = 0; j < gpu_list_.size(); ++j) { + f::details::VarHandle* out_var_handle = new f::details::VarHandle(); + out_var_handle->place_ = gpu_list_[j]; + out_var_handle->name_ = "out"; + out_var_handle->version_ = 2; + out_var_handle->generated_op_ = bc_op_handle_; + bc_op_handle_->AddOutput(out_var_handle); + } + } + void BroadCastDestroy() { + delete ctxs_; + for (auto in : bc_op_handle_->inputs_) { + delete in; + } + for (auto out : bc_op_handle_->outputs_) { + delete out; + } + delete bc_op_handle_; + } + + public: + f::Scope g_scope_; + p::ContextMap* ctxs_; + std::vector local_scope_; + std::vector gpu_list_; + f::details::BCastOpHandle* bc_op_handle_; +}; + +TEST_F(BroadCastTester, BroadCastTestLodTensor) { + int gpu_id = 0; + BroadCastInitOp(gpu_id); + + auto in_var = local_scope_[gpu_id]->Var("input"); + auto in_lod_tensor = in_var->GetMutable(); + in_lod_tensor->mutable_data(kDims, gpu_list_[gpu_id]); + + std::vector send_vector(f::product(kDims), gpu_id + 12); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + f::LoD lod{{0, 10, 20}}; + paddle::framework::TensorFromVector( + send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor); + in_lod_tensor->set_lod(lod); + bc_op_handle_->Run(false); + + ctxs_->WaitAll(); + + p::CPUPlace cpu_place; + for (size_t j = 0; j < gpu_list_.size(); ++j) { + auto out_var = local_scope_[j]->Var("out"); + auto out_tensor = out_var->Get(); + PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); + + f::Tensor result_tensor; + f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor); + float* ct = result_tensor.mutable_data(cpu_place); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + } + } + + BroadCastDestroy(); +} + +TEST_F(BroadCastTester, BroadCastTestSelectedRows) { + int gpu_id = 0; + BroadCastInitOp(gpu_id); + + auto in_var = local_scope_[gpu_id]->Var("input"); + auto in_selected_rows = in_var->GetMutable(); + auto value = in_selected_rows->mutable_value(); + value->mutable_data(kDims, gpu_list_[gpu_id]); + int height = kDims[0] * 2; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + 
in_selected_rows->set_height(height); + in_selected_rows->set_rows(rows); + + std::vector send_vector(f::product(kDims)); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + paddle::framework::TensorFromVector( + send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value); + + bc_op_handle_->Run(false); + + ctxs_->WaitAll(); + + p::CPUPlace cpu_place; + for (size_t j = 0; j < gpu_list_.size(); ++j) { + auto out_var = local_scope_[j]->Var("out"); + auto& out_select_rows = out_var->Get(); + auto rt = out_select_rows.value(); + + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]); + } + + f::Tensor result_tensor; + f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + } + } + + BroadCastDestroy(); +} diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 6b796d92d0..fceb5845ff 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -2,17 +2,20 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once - #include +#include #include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cublas.h" @@ -137,6 +140,45 @@ template <> struct DefaultDeviceContextType { using TYPE = CUDAPinnedDeviceContext; }; + +class ContextMap { + public: + explicit ContextMap(const std::vector& places) { + order_.reserve(places.size()); + for (auto& p : places) { + auto dev = boost::get(p); + int dev_id = dev.device; + order_.emplace_back(dev_id); + contexts_[dev_id].reset(new CUDADeviceContext(dev)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + "Context Map does not support contain two or more same device"); + } + + DeviceContext* DevCtx(int dev_id) const { return at(dev_id); } + + DeviceContext* DevCtx(platform::Place p) const { + return DevCtx(boost::get(p).device); + } + + DeviceContext* at(platform::Place p) const { + return this->at(boost::get(p).device); + } + + DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); } + + void WaitAll() { + for (auto& p : contexts_) { + p.second->Wait(); + } + } + + private: + std::unordered_map> contexts_; + std::vector order_; +}; + #endif #ifdef PADDLE_WITH_MKLDNN From 6db96ec23cd02a4cec41338f3c1e53aa303be78e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Apr 2018 11:47:03 +0800 Subject: [PATCH 24/67] follow comments --- paddle/fluid/framework/details/CMakeLists.txt | 6 ++--- ...st_op_handle.cc => broadcast_op_handle.cc} | 12 ++++----- ...cast_op_handle.h => broadcast_op_handle.h} | 10 +++---- ...le_test.cc => broadcast_op_handle_test.cc} | 26 +++++++++---------- 4 files changed, 27 insertions(+), 27 deletions(-) rename paddle/fluid/framework/details/{broad_cast_op_handle.cc => broadcast_op_handle.cc} (89%) rename paddle/fluid/framework/details/{broad_cast_op_handle.h => broadcast_op_handle.h} (83%) rename paddle/fluid/framework/details/{broad_cast_op_handle_test.cc => broadcast_op_handle_test.cc} (89%) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index eda2b6aac0..7b7582380c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod if(WITH_GPU) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda) - nv_library(broad_cast_op_handle SRCS broad_cast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) + nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) endif() cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) @@ -15,8 +15,8 @@ cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) if(WITH_GPU) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) - nv_test(broad_cast_op_test SRCS broad_cast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory - device_context broad_cast_op_handle) + nv_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory + device_context broadcast_op_handle) else() set(multi_devices_graph_builder_deps) endif() diff --git a/paddle/fluid/framework/details/broad_cast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc similarity index 89% rename from paddle/fluid/framework/details/broad_cast_op_handle.cc rename to 
paddle/fluid/framework/details/broadcast_op_handle.cc index e636371b94..a782ebf8fd 100644 --- a/paddle/fluid/framework/details/broad_cast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/broad_cast_op_handle.h" +#include "paddle/fluid/framework/details/broadcast_op_handle.h" namespace paddle { namespace framework { @@ -28,16 +28,16 @@ Tensor *GetTensorFromVar(Variable *in_var) { } return nullptr; } -BCastOpHandle::BCastOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::ContextMap &ctxs) +BroadcastOpHandle::BroadcastOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::ContextMap &ctxs) : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) { for (auto &p : places_) { this->dev_ctxes_[p] = ctxs_.DevCtx(p); } } -void BCastOpHandle::RunImpl() { +void BroadcastOpHandle::RunImpl() { PADDLE_ENFORCE_EQ(this->inputs_.size(), 1); PADDLE_ENFORCE_EQ(this->outputs_.size(), places_.size()); @@ -97,7 +97,7 @@ void BCastOpHandle::RunImpl() { } } -std::string BCastOpHandle::Name() const { return "broadcast"; } +std::string BroadcastOpHandle::Name() const { return "broadcast"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/broad_cast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h similarity index 83% rename from paddle/fluid/framework/details/broad_cast_op_handle.h rename to paddle/fluid/framework/details/broadcast_op_handle.h index 432e86e410..a571af1218 100644 --- a/paddle/fluid/framework/details/broad_cast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -29,17 +29,17 @@ namespace framework { namespace details { /* - * BroadCast the input to all scope. + * Broadcast the input to all scope. * */ -struct BCastOpHandle : public OpHandleBase { +struct BroadcastOpHandle : public OpHandleBase { const std::vector &local_scopes_; const std::vector &places_; const platform::ContextMap &ctxs_; - BCastOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::ContextMap &ctxs); + BroadcastOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::ContextMap &ctxs); std::string Name() const override; diff --git a/paddle/fluid/framework/details/broad_cast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc similarity index 89% rename from paddle/fluid/framework/details/broad_cast_op_handle_test.cc rename to paddle/fluid/framework/details/broadcast_op_handle_test.cc index a1338abeb5..fd671ded21 100644 --- a/paddle/fluid/framework/details/broad_cast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/broad_cast_op_handle.h" +#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/device_context.h" @@ -23,12 +23,12 @@ namespace p = paddle::platform; // test data amount const f::DDim kDims = {20, 20}; -class BroadCastTester : public ::testing::Test { +class BroadcastTester : public ::testing::Test { public: void SetUp() override { int count = p::GetCUDADeviceCount(); if (count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu BroadCast, because the CUDA " + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " << count; exit(0); @@ -40,7 +40,7 @@ class BroadCastTester : public ::testing::Test { } template - void BroadCastInitOp(int gpu_id = 0) { + void BroadcastInitOp(int gpu_id = 0) { for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scope_.push_back(&g_scope_.NewScope()); auto* out_var = local_scope_[j]->Var("out"); @@ -50,7 +50,7 @@ class BroadCastTester : public ::testing::Test { in_var->GetMutable(); bc_op_handle_ = - new f::details::BCastOpHandle(local_scope_, gpu_list_, *ctxs_); + new f::details::BroadcastOpHandle(local_scope_, gpu_list_, *ctxs_); f::details::VarHandle* in_var_handle = new f::details::VarHandle(); in_var_handle->place_ = gpu_list_[gpu_id]; @@ -68,7 +68,7 @@ class BroadCastTester : public ::testing::Test { bc_op_handle_->AddOutput(out_var_handle); } } - void BroadCastDestroy() { + void BroadcastDestroy() { delete ctxs_; for (auto in : bc_op_handle_->inputs_) { delete in; @@ -84,12 +84,12 @@ class BroadCastTester : public ::testing::Test { p::ContextMap* ctxs_; std::vector local_scope_; std::vector gpu_list_; - f::details::BCastOpHandle* bc_op_handle_; + f::details::BroadcastOpHandle* bc_op_handle_; }; -TEST_F(BroadCastTester, BroadCastTestLodTensor) { +TEST_F(BroadcastTester, BroadcastTestLodTensor) { int gpu_id = 0; - BroadCastInitOp(gpu_id); + BroadcastInitOp(gpu_id); auto in_var = local_scope_[gpu_id]->Var("input"); auto in_lod_tensor = in_var->GetMutable(); @@ -122,12 +122,12 @@ TEST_F(BroadCastTester, BroadCastTestLodTensor) { } } - BroadCastDestroy(); + BroadcastDestroy(); } -TEST_F(BroadCastTester, BroadCastTestSelectedRows) { +TEST_F(BroadcastTester, BroadcastTestSelectedRows) { int gpu_id = 0; - BroadCastInitOp(gpu_id); + BroadcastInitOp(gpu_id); auto in_var = local_scope_[gpu_id]->Var("input"); auto in_selected_rows = in_var->GetMutable(); @@ -170,5 +170,5 @@ TEST_F(BroadCastTester, BroadCastTestSelectedRows) { } } - BroadCastDestroy(); + BroadcastDestroy(); } From 124c93081d26a89b677823a7e2d74260c579fb54 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Apr 2018 14:39:00 +0800 Subject: [PATCH 25/67] remove ContextMap --- paddle/fluid/framework/details/CMakeLists.txt | 7 +- .../framework/details/broadcast_op_handle.cc | 29 +-- .../framework/details/broadcast_op_handle.h | 5 +- .../details/broadcast_op_handle_test.cc | 234 +++++++++++------- paddle/fluid/framework/details/var_handle.h | 1 + paddle/fluid/platform/device_context.h | 45 +--- 6 files changed, 157 insertions(+), 164 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 7b7582380c..2a87f02bd5 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -7,16 +7,12 @@ if(WITH_GPU) dynload_cuda) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) endif() - 
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) - cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) if(WITH_GPU) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) - nv_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory - device_context broadcast_op_handle) else() set(multi_devices_graph_builder_deps) endif() @@ -25,3 +21,6 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) + +cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory + device_context broadcast_op_handle) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index a782ebf8fd..2c99a347bf 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -29,13 +29,8 @@ Tensor *GetTensorFromVar(Variable *in_var) { return nullptr; } BroadcastOpHandle::BroadcastOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::ContextMap &ctxs) - : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) { - for (auto &p : places_) { - this->dev_ctxes_[p] = ctxs_.DevCtx(p); - } -} + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} void BroadcastOpHandle::RunImpl() { PADDLE_ENFORCE_EQ(this->inputs_.size(), 1); @@ -47,26 +42,18 @@ void BroadcastOpHandle::RunImpl() { if (inputs_[0]->generated_op_) inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]); - auto iter = std::find(places_.begin(), places_.end(), in_place); - if (iter == places_.end()) { - PADDLE_THROW("The input of BCast is not in the places_."); - } - - int offset = iter - places_.begin(); - auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_); + auto in_scope_idx = in_var_handle->scope_idx_; + PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), ""); + auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_); Tensor *in_tensor = GetTensorFromVar(in_var); for (auto *out : outputs_) { auto out_handle = static_cast(out); auto &out_p = out_handle->place_; - auto iter = std::find(places_.begin(), places_.end(), out_p); - if (iter == places_.end()) { - PADDLE_THROW("The output of BCast is not in the places_."); - } - int offset = iter - places_.begin(); - - auto *s = local_scopes_[offset]; + auto out_scope_idx = out_handle->scope_idx_; + PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), ""); + auto *s = local_scopes_[out_scope_idx]; auto out_var = s->FindVar(out_handle->name_); PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), ""); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index a571af1218..06ec164ce0 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -35,11 +35,10 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { const std::vector &local_scopes_; const std::vector &places_; - const platform::ContextMap &ctxs_; + // 
const platform::ContextMap &ctxs_; BroadcastOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::ContextMap &ctxs); + const std::vector &places); std::string Name() const override; diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index fd671ded21..d03115f0be 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -25,51 +25,66 @@ const f::DDim kDims = {20, 20}; class BroadcastTester : public ::testing::Test { public: - void SetUp() override { - int count = p::GetCUDADeviceCount(); - if (count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " - "device count is " - << count; - exit(0); + void InitCtx(bool use_gpu) { + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } } - for (int i = 0; i < count; ++i) { - gpu_list_.emplace_back(p::CUDAPlace(i)); - } - ctxs_ = new p::ContextMap(gpu_list_); } template - void BroadcastInitOp(int gpu_id = 0) { + void BroadcastInitOp(int input_scope_idx) { for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scope_.push_back(&g_scope_.NewScope()); auto* out_var = local_scope_[j]->Var("out"); out_var->GetMutable(); } - auto* in_var = local_scope_[gpu_id]->Var("input"); + auto* in_var = local_scope_[input_scope_idx]->Var("input"); in_var->GetMutable(); - bc_op_handle_ = - new f::details::BroadcastOpHandle(local_scope_, gpu_list_, *ctxs_); + bc_op_handle_ = new f::details::BroadcastOpHandle(local_scope_, gpu_list_); f::details::VarHandle* in_var_handle = new f::details::VarHandle(); - in_var_handle->place_ = gpu_list_[gpu_id]; + in_var_handle->place_ = gpu_list_[input_scope_idx]; in_var_handle->name_ = "input"; in_var_handle->version_ = 1; + in_var_handle->scope_idx_ = input_scope_idx; in_var_handle->generated_op_ = nullptr; bc_op_handle_->AddInput(in_var_handle); for (size_t j = 0; j < gpu_list_.size(); ++j) { + bc_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j]; f::details::VarHandle* out_var_handle = new f::details::VarHandle(); out_var_handle->place_ = gpu_list_[j]; out_var_handle->name_ = "out"; out_var_handle->version_ = 2; + out_var_handle->scope_idx_ = j; out_var_handle->generated_op_ = bc_op_handle_; bc_op_handle_->AddOutput(out_var_handle); } } void BroadcastDestroy() { - delete ctxs_; for (auto in : bc_op_handle_->inputs_) { delete in; } @@ -77,98 +92,131 @@ class BroadcastTester : public ::testing::Test { delete out; } delete bc_op_handle_; + for (size_t j = 0; j < ctxs_.size(); ++j) { + delete ctxs_[j]; + } } - public: - f::Scope g_scope_; - p::ContextMap* ctxs_; - std::vector local_scope_; - std::vector gpu_list_; - f::details::BroadcastOpHandle* bc_op_handle_; -}; + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } -TEST_F(BroadcastTester, BroadcastTestLodTensor) { - int gpu_id = 0; - BroadcastInitOp(gpu_id); + void TestBroadcastLodTensor() { + int 
input_scope_idx = 0; + BroadcastInitOp(input_scope_idx); - auto in_var = local_scope_[gpu_id]->Var("input"); - auto in_lod_tensor = in_var->GetMutable(); - in_lod_tensor->mutable_data(kDims, gpu_list_[gpu_id]); + auto in_var = local_scope_[input_scope_idx]->Var("input"); + auto in_lod_tensor = in_var->GetMutable(); + in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); - std::vector send_vector(f::product(kDims), gpu_id + 12); - for (size_t k = 0; k < send_vector.size(); ++k) { - send_vector[k] = k; - } - f::LoD lod{{0, 10, 20}}; - paddle::framework::TensorFromVector( - send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor); - in_lod_tensor->set_lod(lod); - bc_op_handle_->Run(false); - - ctxs_->WaitAll(); - - p::CPUPlace cpu_place; - for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = local_scope_[j]->Var("out"); - auto out_tensor = out_var->Get(); - PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); - - f::Tensor result_tensor; - f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor); - float* ct = result_tensor.mutable_data(cpu_place); - - for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + std::vector send_vector(f::product(kDims), input_scope_idx + 12); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; } - } + f::LoD lod{{0, 10, 20}}; + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); + in_lod_tensor->set_lod(lod); - BroadcastDestroy(); -} + bc_op_handle_->Run(false); -TEST_F(BroadcastTester, BroadcastTestSelectedRows) { - int gpu_id = 0; - BroadcastInitOp(gpu_id); - - auto in_var = local_scope_[gpu_id]->Var("input"); - auto in_selected_rows = in_var->GetMutable(); - auto value = in_selected_rows->mutable_value(); - value->mutable_data(kDims, gpu_list_[gpu_id]); - int height = kDims[0] * 2; - std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, - 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; - in_selected_rows->set_height(height); - in_selected_rows->set_rows(rows); - - std::vector send_vector(f::product(kDims)); - for (size_t k = 0; k < send_vector.size(); ++k) { - send_vector[k] = k; - } - paddle::framework::TensorFromVector( - send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value); + WaitAll(); + + p::CPUPlace cpu_place; + for (size_t j = 0; j < gpu_list_.size(); ++j) { + auto out_var = local_scope_[j]->Var("out"); + auto out_tensor = out_var->Get(); + PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); - bc_op_handle_->Run(false); + f::Tensor result_tensor; + f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor); + float* ct = result_tensor.mutable_data(cpu_place); - ctxs_->WaitAll(); + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + } + } - p::CPUPlace cpu_place; - for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = local_scope_[j]->Var("out"); - auto& out_select_rows = out_var->Get(); - auto rt = out_select_rows.value(); + BroadcastDestroy(); + } - PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); - for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]); + void TestBroadcastSelectedRows() { + int input_scope_idx = 0; + BroadcastInitOp(input_scope_idx); + + auto in_var = local_scope_[input_scope_idx]->Var("input"); + auto in_selected_rows = in_var->GetMutable(); + auto value = in_selected_rows->mutable_value(); + value->mutable_data(kDims, 
gpu_list_[input_scope_idx]); + int height = kDims[0] * 2; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + in_selected_rows->set_height(height); + in_selected_rows->set_rows(rows); + + std::vector send_vector(f::product(kDims)); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; } + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + + bc_op_handle_->Run(false); - f::Tensor result_tensor; - f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor); - float* ct = result_tensor.data(); + WaitAll(); - for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + p::CPUPlace cpu_place; + for (size_t j = 0; j < gpu_list_.size(); ++j) { + auto out_var = local_scope_[j]->Var("out"); + auto& out_select_rows = out_var->Get(); + auto rt = out_select_rows.value(); + + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, + "height is not equal."); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]); + } + + f::Tensor result_tensor; + f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + } } + + BroadcastDestroy(); } - BroadcastDestroy(); + public: + f::Scope g_scope_; + std::vector ctxs_; + std::vector local_scope_; + std::vector gpu_list_; + f::details::BroadcastOpHandle* bc_op_handle_; +}; + +TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) { + InitCtx(false); + TestBroadcastLodTensor(); +} + +TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) { + InitCtx(false); + TestBroadcastSelectedRows(); +} + +#ifdef PADDLE_WITH_CUDA +TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) { + InitCtx(true); + TestBroadcastLodTensor(); +} + +TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) { + InitCtx(true); + TestBroadcastSelectedRows(); } +#endif diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 569dda17c6..871e41343f 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase { // version field currently is not used, however, just store the version to // debug easily. size_t version_; + size_t scope_idx_; std::string name_; platform::Place place_; }; diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index fceb5845ff..39ef082266 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -2,21 +2,19 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once + #include #include #include #include - #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" @@ -140,45 +138,6 @@ template <> struct DefaultDeviceContextType { using TYPE = CUDAPinnedDeviceContext; }; - -class ContextMap { - public: - explicit ContextMap(const std::vector& places) { - order_.reserve(places.size()); - for (auto& p : places) { - auto dev = boost::get(p); - int dev_id = dev.device; - order_.emplace_back(dev_id); - contexts_[dev_id].reset(new CUDADeviceContext(dev)); - } - PADDLE_ENFORCE_EQ( - order_.size(), contexts_.size(), - "Context Map does not support contain two or more same device"); - } - - DeviceContext* DevCtx(int dev_id) const { return at(dev_id); } - - DeviceContext* DevCtx(platform::Place p) const { - return DevCtx(boost::get(p).device); - } - - DeviceContext* at(platform::Place p) const { - return this->at(boost::get(p).device); - } - - DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); } - - void WaitAll() { - for (auto& p : contexts_) { - p.second->Wait(); - } - } - - private: - std::unordered_map> contexts_; - std::vector order_; -}; - #endif #ifdef PADDLE_WITH_MKLDNN From 80bd1ca01f62871b7e14fbdbe70482b3eeff9779 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 11 Apr 2018 01:31:59 -0700 Subject: [PATCH 26/67] "fix the style" --- paddle/fluid/operators/sequence_expand_op.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 8119afce1a..111ccba225 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#define EIGEN_USE_GPU #include #include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -78,7 +77,7 @@ __global__ void sequence_expand_grad_kernel( void GetOutputOffset(const framework::Vector& x_lod, const framework::Vector& ref_lod, - framework::Vector& out_offset) { + framework::Vector* out_offset) { size_t offset = 0; int lod_size = static_cast(x_lod.size()); for (int i = 0; i < static_cast(x_lod.size()); ++i) { @@ -98,7 +97,7 @@ struct SequenceExpandFunctor { LoDTensor* out) { int x_item_length = x.numel() / x.dims()[0]; framework::Vector out_offset(x_lod.size()); - GetOutputOffset(x_lod, ref_lod, out_offset); + GetOutputOffset(x_lod, ref_lod, &out_offset); int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; @@ -124,7 +123,7 @@ struct SequenceExpandGradFunctor { LoDTensor* dx) { int x_item_length = framework::product(dx->dims()) / dx->dims()[0]; framework::Vector out_offset(x_lod.size()); - GetOutputOffset(x_lod, ref_lod, out_offset); + GetOutputOffset(x_lod, ref_lod, &out_offset); int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; From 62d1f9a7cb9b850584fcd22d1c2b57f31174a13a Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 11 Apr 2018 01:44:57 -0700 Subject: [PATCH 27/67] "done" --- paddle/fluid/operators/sequence_expand_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 111ccba225..c00765e5d5 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -81,7 +81,7 @@ void GetOutputOffset(const framework::Vector& x_lod, size_t offset = 0; int lod_size = static_cast(x_lod.size()); for (int i = 0; i < static_cast(x_lod.size()); ++i) { - out_offset[i] = offset; + (*out_offset)[i] = offset; if (i < lod_size - 1) { offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]); } From 52987902c98378432ba9e3fc54307e19e87aaca3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 11 Apr 2018 16:52:16 +0800 Subject: [PATCH 28/67] Polish reshape op --- paddle/fluid/operators/reshape_op.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 807e5ad951..9abc78421a 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -60,7 +60,7 @@ class ReshapeOp : public framework::OperatorWithKernel { static framework::DDim ValidateShape(const std::vector shape, const framework::DDim &in_dims) { const int64_t in_size = framework::product(in_dims); - // only one dimension canbe set to -1, whose size will be automatically + // only one dimension can be set to -1, whose size will be automatically // infered. 
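    // Worked example, assuming the usual reshape convention that a 0 in
    // shape copies the matching input dimension: with in_dims = [4, 8]
    // (in_size = 32), shape = {-1, 0, 2} resolves to [2, 8, 2], since the
    // 0 copies in_dims[1] = 8 and the -1 is inferred as 32 / (8 * 2) = 2.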
const int64_t unk_dim_val = -1; const int64_t copy_dim_val = 0; @@ -119,13 +119,15 @@ class ReshapeKernel : public framework::OpKernel { auto *shape_tensor = ctx.Input("Shape"); framework::DDim out_dims = out->dims(); + if (shape_tensor) { auto *shape_data = shape_tensor->data(); + framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(ctx.GetPlace())) { - framework::Tensor cpu_shape_tensor; TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); + ctx.device_context().Wait(); } auto shape = std::vector(shape_data, shape_data + shape_tensor->numel()); From e7684911fd7680a2c5576da0833b7558a4ff9ba0 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Apr 2018 16:32:31 +0800 Subject: [PATCH 29/67] add gather op handle --- paddle/fluid/framework/details/CMakeLists.txt | 14 +- .../framework/details/broadcast_op_handle.cc | 39 +-- .../framework/details/broadcast_op_handle.h | 1 - .../details/broadcast_op_handle_test.cc | 6 +- .../framework/details/gather_op_handle.cc | 121 ++++++++++ .../framework/details/gather_op_handle.h | 52 ++++ .../details/gather_op_handle_test.cc | 227 ++++++++++++++++++ 7 files changed, 432 insertions(+), 28 deletions(-) create mode 100644 paddle/fluid/framework/details/gather_op_handle.cc create mode 100644 paddle/fluid/framework/details/gather_op_handle.h create mode 100644 paddle/fluid/framework/details/gather_op_handle_test.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 2a87f02bd5..3644ed9cb7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -5,22 +5,22 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod if(WITH_GPU) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda) - nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) -endif() -cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) -cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) -cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) - -if(WITH_GPU) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) else() set(multi_devices_graph_builder_deps) endif() +cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) +cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory device_context broadcast_op_handle) +cc_test(gather_op_test SRCS 
gather_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory + device_context gather_op_handle) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 2c99a347bf..7cd13a50f5 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { namespace details { -Tensor *GetTensorFromVar(Variable *in_var) { +static Tensor *GetTensorFromVar(Variable *in_var) { if (in_var->IsType()) { return in_var->GetMutable(); } else if (in_var->IsType()) { @@ -52,29 +52,34 @@ void BroadcastOpHandle::RunImpl() { auto &out_p = out_handle->place_; auto out_scope_idx = out_handle->scope_idx_; - PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), ""); + PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), + "%s is not the the local_scopes ", out_handle->name_); auto *s = local_scopes_[out_scope_idx]; auto out_var = s->FindVar(out_handle->name_); - PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), ""); + PADDLE_ENFORCE_EQ( + out_var->Type(), in_var->Type(), + "The type of input and output is not equal. (%s_%d vs %s_%d)", + out_handle->name_, out_handle->scope_idx_, in_var_handle->name_, + in_var_handle->scope_idx_); if (in_var->IsType()) { - auto in_sr = in_var->GetMutable(); - auto out = out_var->GetMutable(); - if (in_sr == out) continue; - out->set_height(in_sr->height()); - out->set_rows(in_sr->rows()); - out->mutable_value()->Resize(in_sr->value().dims()); - out->mutable_value()->mutable_data(out_p, in_sr->value().type()); + auto &in_sr = in_var->Get(); + auto out_sr = out_var->GetMutable(); + if (&in_sr == out_sr) continue; + out_sr->set_height(in_sr.height()); + out_sr->set_rows(in_sr.rows()); + out_sr->mutable_value()->Resize(in_sr.value().dims()); + out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type()); } else if (in_var->IsType()) { - auto in_lod = in_var->GetMutable(); - auto out = out_var->GetMutable(); - if (in_lod == out) continue; - out->set_lod(in_lod->lod()); - out->Resize(in_lod->dims()); - out->mutable_data(out_p, in_lod->type()); + auto in_lod = in_var->Get(); + auto out_lod = out_var->GetMutable(); + if (&in_lod == out_lod) continue; + out_lod->set_lod(in_lod.lod()); + out_lod->Resize(in_lod.dims()); + out_lod->mutable_data(out_p, in_lod.type()); } else { - PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + PADDLE_THROW("Var should be LoDTensor or SelectedRows."); } Tensor *out_tensor = GetTensorFromVar(out_var); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 06ec164ce0..74c0a6a098 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -35,7 +35,6 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { const std::vector &local_scopes_; const std::vector &places_; - // const platform::ContextMap &ctxs_; BroadcastOpHandle(const std::vector &local_scopes, const std::vector &places); diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index d03115f0be..29cf120c76 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -84,7 +84,7 @@ class BroadcastTester : public ::testing::Test { bc_op_handle_->AddOutput(out_var_handle); } } - 
void BroadcastDestroy() { + void BroadcastOpDestroy() { for (auto in : bc_op_handle_->inputs_) { delete in; } @@ -139,7 +139,7 @@ class BroadcastTester : public ::testing::Test { } } - BroadcastDestroy(); + BroadcastOpDestroy(); } void TestBroadcastSelectedRows() { @@ -188,7 +188,7 @@ class BroadcastTester : public ::testing::Test { } } - BroadcastDestroy(); + BroadcastOpDestroy(); } public: diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc new file mode 100644 index 0000000000..9407868372 --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/gather_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +static Tensor *GetTensorFromVar(Variable *in_var) { + if (in_var->IsType()) { + return in_var->GetMutable(); + } else if (in_var->IsType()) { + return in_var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } + return nullptr; +} +GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} + +void GatherOpHandle::RunImpl() { + PADDLE_ENFORCE_EQ(this->inputs_.size(), places_.size()); + PADDLE_ENFORCE_EQ(this->outputs_.size(), 1); + + // Wait input done, this Wait is asynchronous operation + for (auto *in : inputs_) { + if (inputs_[0]->generated_op_) { + auto &p = static_cast(in)->place_; + in->generated_op_->Wait(dev_ctxes_[p]); + } + } + auto in_0_handle = static_cast(inputs_[0]); + auto pre_in_var = + local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); + + std::vector out_rows; + std::vector in_tensors; + std::vector in_places; + + // gather the inputs + for (auto *in : inputs_) { + auto in_handle = static_cast(in); + auto in_p = in_handle->place_; + in_places.push_back(in_p); + PADDLE_ENFORCE_LT(in_handle->scope_idx_, local_scopes_.size(), + "%s is not the the local_scopes ", in_handle->name_); + + auto *s = local_scopes_[in_handle->scope_idx_]; + auto in_var = s->FindVar(in_handle->name_); + PADDLE_ENFORCE_EQ(in_var->Type(), pre_in_var->Type(), + "The type of input is not consistent."); + + if (in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + auto &in_sr = in_var->Get(); + auto in_sr_rows = in_sr.rows(); + out_rows.insert(out_rows.begin(), in_sr_rows.begin(), in_sr_rows.end()); + PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), ""); + PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), ""); + } else if (in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + auto &in_lodtensor = in_var->Get(); + PADDLE_ENFORCE_EQ(in_lodtensor.lod(), pre_in.lod()); + PADDLE_ENFORCE_EQ(in_lodtensor.dims(), pre_in.dims()); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows."); + } + 
in_tensors.push_back(GetTensorFromVar(in_var)); + pre_in_var = in_var; + } + + // write the output + auto out_handle = static_cast(this->outputs_[0]); + auto &out_place = out_handle->place_; + auto out_scope_idx = out_handle->scope_idx_; + auto out_var = local_scopes_[out_scope_idx]->FindVar(out_handle->name_); + + if (pre_in_var->IsType()) { + auto &pre_in = pre_in_var->Get(); + auto out = out_var->GetMutable(); + out->set_height(pre_in.height()); + out->set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in.GetCompleteDims(); + out_dim[0] = static_cast(rows); + out->mutable_value()->Resize(out_dim); + out->mutable_value()->mutable_data(out_place, pre_in.value().type()); + auto out_tensor = out->mutable_value(); + // copy + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j]->dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(*(in_tensors[j]), out_place, + *(dev_ctxes_[in_places[j]]), &sub_out); + s = e; + } + } else if (pre_in_var->IsType()) { + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows."); + } +} + +std::string GatherOpHandle::Name() const { return "broadcast"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h new file mode 100644 index 0000000000..48e1db227b --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +/* + * Broadcast the input to all scope. + * + */ +struct GatherOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + + GatherOpHandle(const std::vector &local_scopes, + const std::vector &places); + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc new file mode 100644 index 0000000000..a029a2d266 --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -0,0 +1,227 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/gather_op_handle.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +class GatherTester : public ::testing::Test { + public: + void InitCtx(bool use_gpu) { + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Gather, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } + } + } + + template + void InitGatherOp(int input_scope_idx) { + for (size_t j = 0; j < gpu_list_.size(); ++j) { + local_scope_.push_back(&g_scope_.NewScope()); + auto* out_var = local_scope_[j]->Var("input"); + out_var->GetMutable(); + } + auto* in_var = local_scope_[input_scope_idx]->Var("out"); + in_var->GetMutable(); + + gather_op_handle_ = new f::details::GatherOpHandle(local_scope_, gpu_list_); + + f::details::VarHandle* out_var_handle = new f::details::VarHandle(); + out_var_handle->place_ = gpu_list_[input_scope_idx]; + out_var_handle->name_ = "out"; + out_var_handle->version_ = 2; + out_var_handle->scope_idx_ = input_scope_idx; + out_var_handle->generated_op_ = gather_op_handle_; + gather_op_handle_->AddOutput(out_var_handle); + + for (size_t j = 0; j < gpu_list_.size(); ++j) { + gather_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j]; + f::details::VarHandle* in_var_handle = new f::details::VarHandle(); + in_var_handle->place_ = gpu_list_[j]; + in_var_handle->name_ = "input"; + in_var_handle->version_ = 1; + in_var_handle->scope_idx_ = j; + in_var_handle->generated_op_ = nullptr; + gather_op_handle_->AddInput(in_var_handle); + } + } + void GatherOpDestroy() { + for (auto in : gather_op_handle_->inputs_) { + delete in; + } + for (auto out : gather_op_handle_->outputs_) { + delete out; + } + delete gather_op_handle_; + for (size_t j = 0; j < ctxs_.size(); ++j) { + delete ctxs_[j]; + } + } + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } + + void TestGatherLodTensor() { + // int input_scope_idx = 0; + // InitGatherOp(input_scope_idx); + // + // auto in_var = local_scope_[input_scope_idx]->Var("input"); + // auto in_lod_tensor = in_var->GetMutable(); + // in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); + // + // std::vector send_vector(f::product(kDims), input_scope_idx + + // 12); + // for (size_t k = 0; k < send_vector.size(); ++k) { + // send_vector[k] = k; + // } + // f::LoD lod{{0, 10, 20}}; + // paddle::framework::TensorFromVector( + // send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); + // in_lod_tensor->set_lod(lod); + // + // gather_op_handle_->Run(false); + // + 
// WaitAll(); + // + // p::CPUPlace cpu_place; + // for (size_t j = 0; j < gpu_list_.size(); ++j) { + // auto out_var = local_scope_[j]->Var("out"); + // auto out_tensor = out_var->Get(); + // PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); + // + // f::Tensor result_tensor; + // f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor); + // float* ct = result_tensor.mutable_data(cpu_place); + // + // for (int64_t j = 0; j < f::product(kDims); ++j) { + // ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + // } + // } + // + // GatherOpDestroy(); + } + + void TestGatherSelectedRows() { + int output_scope_idx = 0; + InitGatherOp(output_scope_idx); + + int height = kDims[0] * 2; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + std::vector send_vector(f::product(kDims)); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + + for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); + ++input_scope_idx) { + auto in_var = local_scope_[input_scope_idx]->Var("input"); + auto in_selected_rows = in_var->GetMutable(); + auto value = in_selected_rows->mutable_value(); + value->mutable_data(kDims, gpu_list_[input_scope_idx]); + + in_selected_rows->set_height(height); + in_selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + value->Resize(kDims); + } + + gather_op_handle_->Run(false); + + WaitAll(); + + p::CPUPlace cpu_place; + + auto out_var = local_scope_[output_scope_idx]->Var("out"); + auto& out_select_rows = out_var->Get(); + auto rt = out_select_rows.value(); + + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + } + + f::Tensor result_tensor; + f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); + } + + GatherOpDestroy(); + } + + public: + f::Scope g_scope_; + std::vector ctxs_; + std::vector local_scope_; + std::vector gpu_list_; + f::details::GatherOpHandle* gather_op_handle_; +}; + +// TEST_F(GatherTester, TestCPUGatherTestLodTensor) { +// InitCtx(false); +// TestGatherLodTensor(); +//} + +TEST_F(GatherTester, TestCPUGatherTestSelectedRows) { + InitCtx(false); + TestGatherSelectedRows(); +} + +#ifdef PADDLE_WITH_CUDA +// TEST_F(GatherTester, TestGPUGatherTestLodTensor) { +// InitCtx(true); +// TestGatherLodTensor(); +//} + +TEST_F(GatherTester, TestGPUGatherTestSelectedRows) { + InitCtx(true); + TestGatherSelectedRows(); +} +#endif From 70500398b63cf8a80a6113ada9e06aa5e98a541e Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 12 Apr 2018 09:54:33 +0800 Subject: [PATCH 30/67] wip --- paddle/fluid/operators/detail/grpc_client.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 8bbfd1f159..b546aa1d2f 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -35,7 +35,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + framework::AsyncIO([var_name_val, 
p_ctx, ep_val, p_scope, time_out, ch, + this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; @@ -90,7 +91,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, + this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -133,8 +135,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -197,7 +199,7 @@ bool RPCClient::Wait() { std::vector> waits(req_count_); for (int i = 0; i < req_count_; i++) { - waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); }); } for (int i = 0; i < req_count_; i++) { From 0532bc4078f59e44967df2ebca4e2aa0bd28ea36 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 12 Apr 2018 11:43:46 +0800 Subject: [PATCH 31/67] init --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index fbec88c796..7856d3bbc4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR From 7132bbe6b7329914fefcd4fa9960afda495d3f89 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 12 Apr 2018 12:20:13 +0800 Subject: [PATCH 32/67] update by comment --- paddle/fluid/operators/uniform_random_op.cc | 12 +- paddle/fluid/operators/uniform_random_op.cu | 12 +- .../operators/uniform_random_table_op.cc | 144 ------------------ .../tests/unittests/test_uniform_random_op.py | 46 +++++- .../unittests/test_uniform_random_table_op.py | 66 -------- 5 files changed, 63 insertions(+), 217 deletions(-) delete mode 100644 paddle/fluid/operators/uniform_random_table_op.cc delete mode 100644 python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 87699362b2..155690a6f4 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -24,7 +24,17 @@ template class CPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* tensor = ctx.Output("Out"); + framework::Tensor* tensor(nullptr); + auto out_var = ctx.OutputVar("Out"); + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + } else if (out_var->IsType()) { + auto shape = ctx.Attr>("shape"); + tensor = out_var->GetMutable()->mutable_value(); + tensor->Resize(framework::make_ddim(shape)); + } else { + PADDLE_THROW("Only support SelectedRows and Tensor"); + } T* data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 
1232cd1eb3..33971be3e0 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -43,7 +43,17 @@ template class GPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); + framework::Tensor* tensor(nullptr); + auto out_var = ctx.OutputVar("Out"); + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + } else if (out_var->IsType()) { + auto shape = ctx.Attr>("shape"); + tensor = out_var->GetMutable()->mutable_value(); + tensor->Resize(framework::make_ddim(shape)); + } else { + PADDLE_THROW("Only support SelectedRows and Tensor"); + } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); if (seed == 0) { diff --git a/paddle/fluid/operators/uniform_random_table_op.cc b/paddle/fluid/operators/uniform_random_table_op.cc deleted file mode 100644 index 4664cc5d93..0000000000 --- a/paddle/fluid/operators/uniform_random_table_op.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -class UniformRandomTableInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - VLOG(3) << "Infershape..."; - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of UniformRandomTableOp should not be null."); - - PADDLE_ENFORCE( - ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), - "uniform_random's min must less then max"); - auto &shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - ctx->SetOutputDim("Out", framework::make_ddim(temp)); - } -}; - -class UniformRandomTableOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - VLOG(3) << "RunImpl..."; - auto out = - scope.FindVar(Output("Out"))->GetMutable(); - auto shard_cnt = Attr("shard_cnt"); - auto shard_id = Attr("shard_id"); - auto max_id = Attr("max_id"); - auto shape = Attr>("shape"); - - auto tensor = out->mutable_value(); - tensor->Resize(framework::make_ddim(shape)); - // Only allocate the memory of large table on CPU - auto cpu = platform::CPUPlace(); - float *data = tensor->mutable_data(cpu); - VLOG(3) << "generate seed"; - unsigned int seed = static_cast(Attr("seed")); - std::minstd_rand engine; - if (seed == 0) { - seed = std::random_device()(); - } - engine.seed(seed); - std::uniform_real_distribution dist(Attr("min"), - Attr("max")); - int64_t size = tensor->numel(); - for (int64_t 
i = 0; i < size; ++i) { - data[i] = dist(engine); - } - // initialize rows by round-robin - // TODO(Yancey1989): need to support other way to distribute Ids - VLOG(3) << "calculate rows_size..."; - int64_t rows_size = 0; - if (max_id % shard_cnt == 0) { - rows_size = max_id / shard_cnt; - } else { - rows_size = max_id / shard_cnt + 1; - } - auto *rows = out->mutable_rows(); - rows->resize(rows_size); - (*rows)[0] = shard_id; - for (int64_t idx = 1; idx < rows_size; ++idx) { - (*rows)[idx] = (*rows)[idx - 1] + shard_cnt; - } - out->set_height(max_id); - } -}; - -class UniformRandomTableOpMaker : public framework::OpProtoAndCheckerMaker { - public: - UniformRandomTableOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddOutput("Out", - "(SelectedRows)" - "The output table of uniform random table op."); - AddComment(R"DOC( -Uniform random operator for initializing a table. - -This operator initializes a SelectedRows with random values sampled from a -uniform distribution. - -)DOC"); - AddAttr("max_id", - "(int, required)" - "The maximal Id for the table."); - AddAttr("shard_cnt", - "(int, required)" - "The count of shards for distributing the table."); - AddAttr("shard_id", "(int, required) The current shard ID."); - AddAttr>("shape", - "(vector) The shape of the output tensor"); - AddAttr("min", - "(float, default -1.0) " - "Minimum value of uniform random") - .SetDefault(-1.0f); - AddAttr("max", - "(float, default 1.0) " - "Maximun value of uniform random") - .SetDefault(1.0f); - AddAttr("seed", - "(int, default 0) " - "Random seed used for generating samples. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time.") - .SetDefault(0); - AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") - .SetDefault(framework::proto::VarType::FP32); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(uniform_random_table, ops::UniformRandomTableOp, - ops::UniformRandomTableInferShape, - ops::UniformRandomTableOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 75ff85a55f..346a949b6e 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -15,6 +15,16 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def output_hist(out): + hist, _ = np.histogram(out, range=(-5, 10)) + hist = hist.astype("float32") + hist /= float(out.size) + prob = 0.1 * np.ones((10)) + return hist, prob class TestUniformRandomOp(OpTest): @@ -33,11 +43,37 @@ class TestUniformRandomOp(OpTest): self.check_output_customized(self.verify_output) def verify_output(self, outs): - tensor = outs[0] - hist, _ = np.histogram(outs[0], range=(-5, 10)) - hist = hist.astype("float32") - hist /= float(outs[0].size) - prob = 0.1 * np.ones((10)) + hist, prob = output_hist(np.array(outs[0])) + self.assertTrue( + np.allclose( + hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + + +class TestUniformRandomOpSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def 
test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + out = scope.var("X").get_selected_rows() + + op = Operator( + "uniform_random", + Out="X", + shape=[4, 784], + min=-5.0, + max=10.0, + seed=10) + op.run(scope, place) + self.assertEqual(out.get_tensor().shape(), [4, 784]) + hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py deleted file mode 100644 index 0474c51e49..0000000000 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_table_op.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np -from op_test import OpTest -import paddle.fluid.core as core -from paddle.fluid.op import Operator - - -def output_hist(out): - hist, _ = np.histogram(out, range=(-5, 10)) - hist = hist.astype("float32") - hist /= float(out.size) - prob = 0.1 * np.ones((10)) - return hist, prob - - -class TestUniformRandomTableOp(unittest.TestCase): - def get_places(self): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - return places - - def test_check_output(self): - for place in self.get_places(): - self.check_with_place(place) - - def check_with_place(self, place): - scope = core.Scope() - out = scope.var("X").get_selected_rows() - - op = Operator( - "uniform_random_table", - Out="X", - shape=[4, 784], - min=-5.0, - max=10.0, - seed=10, - shard_cnt=3, - shard_id=1, - max_id=10) - op.run(scope, place) - self.assertEqual(out.rows(), [1, 4, 7, 10]) - self.assertEqual(out.height(), 10) - self.assertEqual(out.get_tensor().shape(), [4, 784]) - hist, prob = output_hist(np.array(out.get_tensor())) - self.assertTrue( - np.allclose( - hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) - - -if __name__ == "__main__": - unittest.main() From 9e9f5d8080995e71b3a7ef8fd20a0a02f33f107f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 12 Apr 2018 12:43:16 +0800 Subject: [PATCH 33/67] fix ci --- paddle/fluid/operators/uniform_random_op.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 33971be3e0..00011bbe61 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -44,11 +44,11 @@ class GPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { framework::Tensor* tensor(nullptr); - auto out_var = ctx.OutputVar("Out"); + auto out_var = context.OutputVar("Out"); if (out_var->IsType()) { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { - auto shape = 
ctx.Attr>("shape"); + auto shape = context.Attr>("shape"); tensor = out_var->GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(shape)); } else { From 1204d9f3d1b76de8d3fce594634134bcfb653c8e Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Thu, 12 Apr 2018 13:12:05 +0800 Subject: [PATCH 34/67] Refine batch_norm_op. --- paddle/fluid/operators/batch_norm_op.cu.cc | 27 ++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index eecb58e11e..cb1927bc0f 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -114,23 +114,11 @@ class BatchNormKernel const auto *bias = ctx.Input("Bias"); auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); // alloc memory y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant> - functor; - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); auto handle = dev_ctx.cudnn_handle(); @@ -159,6 +147,21 @@ class BatchNormKernel // Run training mode. // obtain running mean and running inv var, and see if we need to // initialize them. + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + math::SetConstant> + functor; + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + double this_factor = 1. - momentum; CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( From 339be6254ea5e3432e4cbe44f35609bb45662e12 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 12 Apr 2018 05:58:26 +0000 Subject: [PATCH 35/67] Refine the order of arguments. 
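The reorder below is forced by C++'s defaulted-argument rule: every parameter after one with a default must also have a default, so a flag that callers actually override has to sit before the rarely-touched defaulted strings. A minimal sketch of the pattern, with hypothetical names (not the real executor signature):

    // Before: overriding only the flag forces both defaults to be spelled out.
    void RunPrepared(Context* ctx, const std::string& feed_holder = "feed",
                     const std::string& fetch_holder = "fetch",
                     bool create_vars = true);
    RunPrepared(ctx, "feed", "fetch", false);

    // After: the flag precedes the defaulted names, so callers write only
    // what differs from the defaults.
    void RunPrepared(Context* ctx, bool create_vars = true,
                     const std::string& feed_holder = "feed",
                     const std::string& fetch_holder = "fetch");
    RunPrepared(ctx, false);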
--- paddle/fluid/framework/executor.cc | 5 ++--- paddle/fluid/framework/executor.h | 4 ++-- paddle/fluid/inference/tests/test_helper.h | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 910012927b..34bba77f40 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -359,9 +359,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext( ExecutorPrepareContext* ctx, Scope* scope, std::map& feed_targets, - std::map& fetch_targets, - const std::string& feed_holder_name, const std::string& fetch_holder_name, - bool create_vars) { + std::map& fetch_targets, bool create_vars, + const std::string& feed_holder_name, const std::string& fetch_holder_name) { auto& global_block = ctx->prog_.Block(ctx->block_id_); PADDLE_ENFORCE( diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index cbd70d9544..8b3ea01542 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -73,9 +73,9 @@ class Executor { void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, std::map& feed_targets, std::map& fetch_targets, + bool create_vars = true, const std::string& feed_holder_name = "feed", - const std::string& fetch_holder_name = "fetch", - bool create_vars = true); + const std::string& fetch_holder_name = "fetch"); private: const platform::Place place_; diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 09fe344ec7..9875e43860 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -178,8 +178,8 @@ void TestInference(const std::string& dirname, std::unique_ptr ctx; if (PrepareContext) { ctx = executor.Prepare(*inference_program, 0); - executor.RunPreparedContext(ctx.get(), scope, feed_targets, - fetch_targets); + executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, + CreateVars); } else { executor.Run(*inference_program, scope, feed_targets, fetch_targets, CreateVars); @@ -198,7 +198,7 @@ void TestInference(const std::string& dirname, // Note: if you changed the inference_program, you need to call // executor.Prepare() again to get a new ExecutorPrepareContext. 
executor.RunPreparedContext(ctx.get(), scope, feed_targets, - fetch_targets); + fetch_targets, CreateVars); } else { executor.Run(*inference_program, scope, feed_targets, fetch_targets, CreateVars); From 26cfc634b9f4dc02b051b49f54e33b57938e5ff2 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 12 Apr 2018 14:48:26 +0800 Subject: [PATCH 36/67] multi stream thread pool --- paddle/fluid/framework/threadpool.cc | 10 +++++++--- paddle/fluid/framework/threadpool.h | 10 +++++----- paddle/fluid/operators/detail/grpc_server.cc | 2 +- .../paddle/fluid/tests/book/test_recognize_digits.py | 1 - 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 0a8377cc47..109c2c745c 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -14,8 +14,12 @@ #include "paddle/fluid/framework/threadpool.h" +#include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" +DEFINE_int32(io_threadpool_size, 100, + "number of threads used for doing IO, default 100"); + namespace paddle { namespace framework { @@ -94,15 +98,15 @@ void ThreadPool::TaskLoop() { std::unique_ptr MultiStreamThreadPool::io_threadpool_(nullptr); std::once_flag MultiStreamThreadPool::io_init_flag_; -MultiStreamThreadPool* MultiStreamThreadPool::GetInstanceIO() { +ThreadPool* MultiStreamThreadPool::GetInstanceIO() { std::call_once(io_init_flag_, &MultiStreamThreadPool::InitIO); - return static_cast(io_threadpool_.get()); + return io_threadpool_.get(); } void MultiStreamThreadPool::InitIO() { if (io_threadpool_.get() == nullptr) { // TODO(typhoonzero1986): make this configurable - io_threadpool_.reset(new ThreadPool(100)); + io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size)); } } diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 0a60488d9f..1cc058834c 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,12 +14,12 @@ limitations under the License. 
*/ #pragma once -#include +#include // NOLINT #include -#include -#include +#include // NOLINT +#include // NOLINT #include -#include +#include // NOLINT #include #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -137,7 +137,7 @@ class ThreadPool { class MultiStreamThreadPool : ThreadPool { public: - static MultiStreamThreadPool* GetInstanceIO(); + static ThreadPool* GetInstanceIO(); static void InitIO(); private: diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index d5fc163bc2..36dad5dd43 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -216,10 +216,10 @@ void AsyncGRPCServer::RunSyncUpdate() { std::function prefetch_register = std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); + // TODO(wuyi): Run these "HandleRequest" in thread pool t_send_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_send_.get(), "cq_send", send_register))); - t_get_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_get_.get(), "cq_get", get_register))); diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index e4997b4069..5ec6890c1b 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -157,7 +157,6 @@ def train(nn_type, for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - pserver_endpoints = os.getenv("PSERVERS") trainers = int(os.getenv("TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) From e26c6d78adc47eb721286f9b0517ac500e03528a Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Apr 2018 22:42:29 +0800 Subject: [PATCH 37/67] code refine --- paddle/fluid/framework/details/CMakeLists.txt | 10 ++-- .../framework/details/broadcast_op_handle.cc | 29 ++++------ .../framework/details/broadcast_op_handle.h | 4 -- .../details/broadcast_op_handle_test.cc | 18 ++++-- .../framework/details/gather_op_handle.cc | 39 ++++++------- .../framework/details/gather_op_handle.h | 4 -- .../details/gather_op_handle_test.cc | 56 ++++--------------- .../fluid/framework/details/op_handle_base.cc | 15 +++++ .../fluid/framework/details/op_handle_base.h | 8 +++ 9 files changed, 83 insertions(+), 100 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9c1d145828..897e41f79f 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(var_handle SRCS var_handle.cc DEPS place) -cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context) +cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory @@ -21,10 +21,10 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framewor cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle 
ssa_graph_executor scope simple_threadpool device_context) -cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) -cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory) +cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory) -cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory +cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) -cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory +cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context gather_op_handle) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 7cd13a50f5..dc8db33ef4 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -18,23 +18,16 @@ namespace paddle { namespace framework { namespace details { -static Tensor *GetTensorFromVar(Variable *in_var) { - if (in_var->IsType()) { - return in_var->GetMutable(); - } else if (in_var->IsType()) { - return in_var->GetMutable()->mutable_value(); - } else { - PADDLE_THROW("Var should be LoDTensor or SelectedRows"); - } - return nullptr; -} BroadcastOpHandle::BroadcastOpHandle(const std::vector &local_scopes, const std::vector &places) : local_scopes_(local_scopes), places_(places) {} void BroadcastOpHandle::RunImpl() { - PADDLE_ENFORCE_EQ(this->inputs_.size(), 1); - PADDLE_ENFORCE_EQ(this->outputs_.size(), places_.size()); + PADDLE_ENFORCE_EQ(this->inputs_.size(), 1, + "The number of input should be one."); + PADDLE_ENFORCE_EQ( + this->outputs_.size(), places_.size(), + "The number of output should equal to the number of places."); // Wait input done, this Wait is asynchronous operation auto in_var_handle = static_cast(this->inputs_[0]); @@ -43,7 +36,9 @@ void BroadcastOpHandle::RunImpl() { inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]); auto in_scope_idx = in_var_handle->scope_idx_; - PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), ""); + PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), + "The input(%s) is not in the local_scopes.", + in_var_handle->name_); auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_); Tensor *in_tensor = GetTensorFromVar(in_var); @@ -56,12 +51,8 @@ void BroadcastOpHandle::RunImpl() { "%s is not the the local_scopes ", out_handle->name_); auto *s = local_scopes_[out_scope_idx]; auto out_var = s->FindVar(out_handle->name_); - - PADDLE_ENFORCE_EQ( - out_var->Type(), in_var->Type(), - "The type of input and output is not equal. 
(%s_%d vs %s_%d)", - out_handle->name_, out_handle->scope_idx_, in_var_handle->name_, - in_var_handle->scope_idx_); + PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(), + "The place of input and output should be the same."); if (in_var->IsType()) { auto &in_sr = in_var->Get(); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 74c0a6a098..b329242252 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -28,10 +28,6 @@ namespace paddle { namespace framework { namespace details { -/* - * Broadcast the input to all scope. - * - */ struct BroadcastOpHandle : public OpHandleBase { const std::vector &local_scopes_; const std::vector &places_; diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index 29cf120c76..cd069df118 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -17,6 +17,10 @@ #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +namespace details { + namespace f = paddle::framework; namespace p = paddle::platform; @@ -25,7 +29,7 @@ const f::DDim kDims = {20, 20}; class BroadcastTester : public ::testing::Test { public: - void InitCtx(bool use_gpu) { + void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #ifdef PADDLE_WITH_CUDA int count = p::GetCUDADeviceCount(); @@ -200,23 +204,27 @@ class BroadcastTester : public ::testing::Test { }; TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) { - InitCtx(false); + InitCtxOnGpu(false); TestBroadcastLodTensor(); } TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) { - InitCtx(false); + InitCtxOnGpu(false); TestBroadcastSelectedRows(); } #ifdef PADDLE_WITH_CUDA TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) { - InitCtx(true); + InitCtxOnGpu(true); TestBroadcastLodTensor(); } TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) { - InitCtx(true); + InitCtxOnGpu(true); TestBroadcastSelectedRows(); } #endif + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 9407868372..3047054d1a 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -18,23 +18,16 @@ namespace paddle { namespace framework { namespace details { -static Tensor *GetTensorFromVar(Variable *in_var) { - if (in_var->IsType()) { - return in_var->GetMutable(); - } else if (in_var->IsType()) { - return in_var->GetMutable()->mutable_value(); - } else { - PADDLE_THROW("Var should be LoDTensor or SelectedRows"); - } - return nullptr; -} GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, const std::vector &places) : local_scopes_(local_scopes), places_(places) {} void GatherOpHandle::RunImpl() { - PADDLE_ENFORCE_EQ(this->inputs_.size(), places_.size()); - PADDLE_ENFORCE_EQ(this->outputs_.size(), 1); + PADDLE_ENFORCE_EQ( + this->inputs_.size(), places_.size(), + "The number of inputs should be equal to the number of place."); + PADDLE_ENFORCE_EQ(this->outputs_.size(), 1, + "The number of output should be one."); // Wait input done, this Wait is asynchronous operation for (auto *in : inputs_) { @@ -46,6 +39,7 @@ void GatherOpHandle::RunImpl() { auto in_0_handle = static_cast(inputs_[0]); auto 
pre_in_var = local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); + auto pre_place = in_0_handle->place_; std::vector out_rows; std::vector in_tensors; @@ -58,7 +52,8 @@ void GatherOpHandle::RunImpl() { in_places.push_back(in_p); PADDLE_ENFORCE_LT(in_handle->scope_idx_, local_scopes_.size(), "%s is not the the local_scopes ", in_handle->name_); - + PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), + "The place of input should be the same."); auto *s = local_scopes_[in_handle->scope_idx_]; auto in_var = s->FindVar(in_handle->name_); PADDLE_ENFORCE_EQ(in_var->Type(), pre_in_var->Type(), @@ -69,13 +64,17 @@ void GatherOpHandle::RunImpl() { auto &in_sr = in_var->Get(); auto in_sr_rows = in_sr.rows(); out_rows.insert(out_rows.begin(), in_sr_rows.begin(), in_sr_rows.end()); - PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), ""); - PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), ""); + PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), + "The height of inputs is not consistent."); + PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), + "The dims of inputs is not consistent."); } else if (in_var->IsType()) { auto &pre_in = pre_in_var->Get(); auto &in_lodtensor = in_var->Get(); - PADDLE_ENFORCE_EQ(in_lodtensor.lod(), pre_in.lod()); - PADDLE_ENFORCE_EQ(in_lodtensor.dims(), pre_in.dims()); + PADDLE_ENFORCE_EQ(in_lodtensor.lod(), pre_in.lod(), + "The lod of inputs is not consistent."); + PADDLE_ENFORCE_EQ(in_lodtensor.dims(), pre_in.dims(), + "The dims of inputs is not consistent."); } else { PADDLE_THROW("Var should be LoDTensor or SelectedRows."); } @@ -88,7 +87,8 @@ void GatherOpHandle::RunImpl() { auto &out_place = out_handle->place_; auto out_scope_idx = out_handle->scope_idx_; auto out_var = local_scopes_[out_scope_idx]->FindVar(out_handle->name_); - + PADDLE_ENFORCE_EQ(out_place.which(), pre_place.which(), + "The place of input and output should be the same."); if (pre_in_var->IsType()) { auto &pre_in = pre_in_var->Get(); auto out = out_var->GetMutable(); @@ -110,12 +110,13 @@ void GatherOpHandle::RunImpl() { s = e; } } else if (pre_in_var->IsType()) { + // TODO: gathering LoDTensor is not implemented yet. } else { PADDLE_THROW("Var should be LoDTensor or SelectedRows."); } } -std::string GatherOpHandle::Name() const { return "broadcast"; } +std::string GatherOpHandle::Name() const { return "gather"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h index 48e1db227b..6c0231f642 100644 --- a/paddle/fluid/framework/details/gather_op_handle.h +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -28,10 +28,6 @@ namespace paddle { namespace framework { namespace details { -/* - * Broadcast the input to all scope.
- * - */ struct GatherOpHandle : public OpHandleBase { const std::vector &local_scopes_; const std::vector &places_; diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index a029a2d266..5d105b37aa 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -17,6 +17,9 @@ #include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +namespace details { namespace f = paddle::framework; namespace p = paddle::platform; @@ -25,7 +28,7 @@ const f::DDim kDims = {20, 20}; class GatherTester : public ::testing::Test { public: - void InitCtx(bool use_gpu) { + void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #ifdef PADDLE_WITH_CUDA int count = p::GetCUDADeviceCount(); @@ -103,45 +106,7 @@ class GatherTester : public ::testing::Test { } } - void TestGatherLodTensor() { - // int input_scope_idx = 0; - // InitGatherOp(input_scope_idx); - // - // auto in_var = local_scope_[input_scope_idx]->Var("input"); - // auto in_lod_tensor = in_var->GetMutable(); - // in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); - // - // std::vector send_vector(f::product(kDims), input_scope_idx + - // 12); - // for (size_t k = 0; k < send_vector.size(); ++k) { - // send_vector[k] = k; - // } - // f::LoD lod{{0, 10, 20}}; - // paddle::framework::TensorFromVector( - // send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); - // in_lod_tensor->set_lod(lod); - // - // gather_op_handle_->Run(false); - // - // WaitAll(); - // - // p::CPUPlace cpu_place; - // for (size_t j = 0; j < gpu_list_.size(); ++j) { - // auto out_var = local_scope_[j]->Var("out"); - // auto out_tensor = out_var->Get(); - // PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); - // - // f::Tensor result_tensor; - // f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor); - // float* ct = result_tensor.mutable_data(cpu_place); - // - // for (int64_t j = 0; j < f::product(kDims); ++j) { - // ASSERT_NEAR(ct[j], send_vector[j], 1e-5); - // } - // } - // - // GatherOpDestroy(); - } + void TestGatherLodTensor() {} void TestGatherSelectedRows() { int output_scope_idx = 0; @@ -205,23 +170,26 @@ class GatherTester : public ::testing::Test { }; // TEST_F(GatherTester, TestCPUGatherTestLodTensor) { -// InitCtx(false); +// InitCtxOnGpu(false); // TestGatherLodTensor(); //} TEST_F(GatherTester, TestCPUGatherTestSelectedRows) { - InitCtx(false); + InitCtxOnGpu(false); TestGatherSelectedRows(); } #ifdef PADDLE_WITH_CUDA // TEST_F(GatherTester, TestGPUGatherTestLodTensor) { -// InitCtx(true); +// InitCtxOnGpu(true); // TestGatherLodTensor(); //} TEST_F(GatherTester, TestGPUGatherTestSelectedRows) { - InitCtx(true); + InitCtxOnGpu(true); TestGatherSelectedRows(); } #endif +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index e4194a7442..0d7fbdfeab 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -17,6 +17,21 @@ namespace paddle { namespace framework { namespace details { + +// GetTensorFromVar is used in broadcast_op handle and gather_op handle, so it +// should be placed in a commonplace. I don't find an appropriate place, so I +// temporarily place it in op_handle_base. 
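+// A usage sketch (the call-site names below are illustrative, not from this
+// patch): callers fetch a Variable and let the helper resolve the tensor,
+//   Variable* in_var = scope->FindVar(name);
+//   Tensor* t = GetTensorFromVar(in_var);  // the LoDTensor itself, or the
+//                                          // value tensor of a SelectedRows
+// so broadcast/gather never branch on the variable type themselves; any
+// other variable type throws immediately instead of failing later.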
+Tensor *GetTensorFromVar(Variable *in_var) { + if (in_var->IsType()) { + return in_var->GetMutable(); + } else if (in_var->IsType()) { + return in_var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } + return nullptr; +} + std::string OpHandleBase::DebugString() const { std::stringstream ss; ss << "("; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index d7a541ac4b..fedff07772 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -17,6 +17,9 @@ #include #include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/macros.h" @@ -24,6 +27,11 @@ namespace paddle { namespace framework { namespace details { +// GetTensorFromVar is used in broadcast_op handle and gather_op handle, so it +// should be placed in a commonplace. I don't find an appropriate place, so I +// temporarily place it in op_handle. +Tensor *GetTensorFromVar(Variable *in_var); + class OpHandleBase { private: DISABLE_COPY_AND_ASSIGN(OpHandleBase); From 449bdde58accc9beb94d56c8ef33c0bde4c007b7 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 12 Apr 2018 06:15:24 +0000 Subject: [PATCH 38/67] Correct some typos. --- cmake/cblas.cmake | 2 +- paddle/fluid/framework/executor.cc | 19 +++++++++++-------- paddle/fluid/framework/executor.h | 3 +++ paddle/fluid/inference/io.cc | 2 +- paddle/fluid/inference/tests/test_helper.h | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 52a22c1fbf..e3b9d94215 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -78,7 +78,7 @@ if(NOT CMAKE_CROSSCOMPILING) /usr/lib/reference/ ) else() - # Diable the finding of reference cblas under host's system path + # Disable the finding of reference cblas under host's system path set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) endif() diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 34bba77f40..513e720fd0 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (tensor.type().hash_code() != typeid(float).hash_code() && - tensor.type().hash_code() != typeid(double).hash_code()) { + if (tensor.type().hash_code() != typeid(float).hash_code() && // NOLINT + tensor.type().hash_code() != typeid(double).hash_code()) { // NOLINT return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, // Return true if the block has feed operators and holder of matching info. static bool has_feed_operators( const BlockDesc& block, - std::map& feed_targets, + const std::map& feed_targets, const std::string& feed_holder_name) { size_t feed_count = 0; for (auto* op : block.AllOps()) { if (op->Type() == kFeedOpType) { feed_count++; + // The input variable's name of feed_op should be feed_holder_name. 
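+      // For example, a program feeding two targets would contain (sketch;
+      // the target names are made up):
+      //   feed_op{X: ["feed"], Out: ["image"], attr col=0}
+      //   feed_op{X: ["feed"], Out: ["label"], attr col=1}
+      // i.e. one feed op per target, all reading the shared holder variable
+      // at distinct columns, which is the shape the checks below enforce.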
PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, "Input to feed op should be '%s'", feed_holder_name); std::string feed_target_name = op->Output("Out")[0]; @@ -167,7 +168,7 @@ static bool has_feed_operators( "The number of feed operators should match 'feed_targets'"); if (!feed_holder_name.empty()) { - // When feed operator are present, so should be feed_holder + // When feed operator are present, so should be feed_holder. auto var = block.FindVar(feed_holder_name); PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", feed_holder_name); @@ -187,12 +188,14 @@ static bool has_feed_operators( // and fetch_holder_name. Raise exception when any mismatch is found. // Return true if the block has fetch operators and holder of matching info. static bool has_fetch_operators( - const BlockDesc& block, std::map& fetch_targets, + const BlockDesc& block, + const std::map& fetch_targets, const std::string& fetch_holder_name) { size_t fetch_count = 0; for (auto* op : block.AllOps()) { if (op->Type() == kFetchOpType) { fetch_count++; + // The output variable's name of fetch_op should be fetch_holder_name. PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, "Output of fetch op should be '%s'", fetch_holder_name); std::string fetch_target_name = op->Input("X")[0]; @@ -209,7 +212,7 @@ static bool has_fetch_operators( "The number of fetch operators should match 'fetch_targets'"); if (!fetch_holder_name.empty()) { - // When fetch operator are present, so should be fetch_holder + // When fetch operator are present, so should be fetch_holder. auto var = block.FindVar(fetch_holder_name); PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", fetch_holder_name); @@ -287,8 +290,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } auto ctx = Prepare(*copy_program, 0); - RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, - feed_holder_name, fetch_holder_name, create_vars); + RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars, + feed_holder_name, fetch_holder_name); } std::unique_ptr Executor::Prepare( diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 8b3ea01542..43defdacf2 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index a29d457b6f..3b58019db6 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace inference { -// Temporarilly add this function for exposing framework::InitDevices() when +// Temporarily add this function for exposing framework::InitDevices() when // linking the inference shared library. 
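// A typical embedding sequence, assuming the loader API of this codebase
// (treat the exact signatures as illustrative):
//   paddle::inference::Init(false /*init_p2p*/);
//   auto program = paddle::inference::Load(&executor, &scope, dirname);
// InitDevices must run exactly once before any kernel executes, which is
// why the symbol is re-exported from the inference library here.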
void Init(bool init_p2p) { framework::InitDevices(init_p2p); } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 9875e43860..c3a8d0889c 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -195,7 +195,7 @@ void TestInference(const std::string& dirname, paddle::platform::DeviceContextPool::Instance().Get(place)); if (PrepareContext) { - // Note: if you changed the inference_program, you need to call + // Note: if you change the inference_program, you need to call // executor.Prepare() again to get a new ExecutorPrepareContext. executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, CreateVars); From 8eac2a46f7f6945cf2c553d8716be02b96791813 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 12 Apr 2018 19:52:40 +0800 Subject: [PATCH 39/67] update by comment --- paddle/fluid/operators/uniform_random_op.cc | 6 ++++-- paddle/fluid/operators/uniform_random_op.cu | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 155690a6f4..acaefaacda 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -24,7 +24,7 @@ template class CPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* tensor(nullptr); + framework::Tensor* tensor = nullptr; auto out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { tensor = out_var->GetMutable(); @@ -33,7 +33,9 @@ class CPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(shape)); } else { - PADDLE_THROW("Only support SelectedRows and Tensor"); + PADDLE_THROW( + "uniform_random_op's output only" + "supports SelectedRows and Tensor"); } T* data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 00011bbe61..e1c7323a30 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -43,7 +43,7 @@ template class GPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - framework::Tensor* tensor(nullptr); + framework::Tensor* tensor = nullptr; auto out_var = context.OutputVar("Out"); if (out_var->IsType()) { tensor = out_var->GetMutable(); @@ -52,7 +52,9 @@ class GPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(shape)); } else { - PADDLE_THROW("Only support SelectedRows and Tensor"); + PADDLE_THROW( + "uniform_random_op's output only" + "supports SelectedRows and Tensor"); } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); From d24b5e060f738139feab99b1c4a97042bce1982f Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 12 Apr 2018 14:33:38 +0200 Subject: [PATCH 40/67] The fully connected: the operator is removed when the MKLDNN flag is OFF --- paddle/fluid/operators/CMakeLists.txt | 8 ++++++++ python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt 
index 3c8696b508..7d6781c2c3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -245,9 +245,17 @@ op_library(channel_send_op DEPS concurrency) op_library(channel_recv_op DEPS concurrency) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) + +# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one MKLDNN operator +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM GENERAL_OPS fc_op) +endif(NOT WITH_MKLDNN) + foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() + file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") add_subdirectory(reader) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f10ef9b634..3bd24c98a2 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,6 +1,12 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +# The fully connected test is removed when the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one kernel (MKLDNN) +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM TEST_OPS test_fc_op) +endif(NOT WITH_MKLDNN) + if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) endif(NOT WITH_DISTRIBUTE) From 617e790a596ccd3f2eb940fcfe76803c01ee6cc8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 12 Apr 2018 11:48:17 -0700 Subject: [PATCH 41/67] fix cuda 7.5 compile error (#9885) --- paddle/fluid/operators/math/math_function.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index e53183603f..c28047e6e9 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -288,9 +288,14 @@ void batched_gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, "cublas Hgemm requires GPU compute capability >= 53"); + +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -310,9 +315,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -332,9 +341,13 @@ void batched_gemm( (transB == CblasNoTrans) ?
CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> From 59234b7287980ef0fec0a064f524e6c25697b7c7 Mon Sep 17 00:00:00 2001 From: redrayqll Date: Fri, 13 Apr 2018 03:25:44 +0800 Subject: [PATCH 42/67] =?UTF-8?q?modify=20=E2=80=9Cif-then-else=E2=80=9D?= =?UTF-8?q?=20md=20path=20(#9876)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/fluid/design/motivation/fluid.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md index 5e147f8263..4b7696cc1b 100644 --- a/doc/fluid/design/motivation/fluid.md +++ b/doc/fluid/design/motivation/fluid.md @@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. -We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid. +We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid. ## Turing Completeness From 3794027d7fbb4d6636534c78452aad589db66361 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 12 Apr 2018 15:45:07 -0700 Subject: [PATCH 43/67] Fix warnings in sgd_op.h --- paddle/fluid/operators/sgd_op.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 8d2bdf7590..cfc8793e1e 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -65,7 +65,8 @@ class SGDOpKernel : public framework::OpKernel { auto &grad_rows = grad->rows(); size_t grad_row_numel = grad_value.numel() / grad_rows.size(); - PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height); + PADDLE_ENFORCE_EQ(static_cast(grad_row_numel), + param_out->numel() / grad_height); auto *grad_data = grad_value.data(); auto *out_data = param_out->data(); @@ -73,7 +74,7 @@ class SGDOpKernel : public framework::OpKernel { for (size_t i = 0; i < grad_rows.size(); i++) { PADDLE_ENFORCE(grad_rows[i] < grad_height, "Input rows index should less than height"); - for (int64_t j = 0; j < grad_row_numel; j++) { + for (size_t j = 0; j < grad_row_numel; j++) { out_data[grad_rows[i] * grad_row_numel + j] -= lr[0] * grad_data[i * grad_row_numel + j]; } @@ -107,7 +108,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE(grad.rows()[i] < grad.height(), "Input rows index should less than height"); int64_t id_index = param.index(grad.rows()[i]); - for (int64_t j = 0; j < grad_row_width; j++) { + for (size_t j = 0; j < grad_row_width; j++) { out_data[id_index * grad_row_width + j] -= lr[0] * grad_data[i * grad_row_width + j]; } From 9b63b7dde0173cc10b0d99d50d7f37837665b673 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 12 Apr 2018 16:04:58 -0700 Subject: [PATCH 44/67] Fix warnings in split_ids_op --- paddle/fluid/operators/split_ids_op.h | 6 ++++-- 1 file changed, 4 
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index ba1e903dbb..d263426e07 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -60,7 +60,9 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
     } else if (ids_var->IsType<framework::SelectedRows>()) {
       const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
       auto &ids_dims = ids_selected_rows->value().dims();
-      PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
+      PADDLE_ENFORCE_EQ(ids_dims[0],
+                        static_cast<int64_t>(ids_selected_rows->rows().size()),
+                        "");
       const T *ids = ids_selected_rows->value().data<T>();
       const auto &ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
@@ -77,7 +79,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
         framework::DDim ddim = framework::make_ddim(
             {static_cast<int64_t>(out->rows().size()), row_width});
         T *output = out->mutable_value()->mutable_data<T>(ddim, place);
-        for (size_t i = 0; i < ddim[0]; ++i) {
+        for (int64_t i = 0; i < ddim[0]; ++i) {
           memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
                  row_width * sizeof(T));
         }

From c241959e489053259274edb2614381d7058463a4 Mon Sep 17 00:00:00 2001
From: Abhinav Arora
Date: Thu, 12 Apr 2018 16:45:40 -0700
Subject: [PATCH 45/67] Fix CPPLint errors in operators (#9828)

* Fix CPPLint errors in operators
* Fix prior box op
* Fix Prior Box op
* Fix top_k_op.cu
* Fix pool mkldnn
* Fix pool mkldnn

---
 paddle/fluid/operators/pad_op.h             |  2 +
 paddle/fluid/operators/pool_mkldnn_op.cc    | 12 ++-
 paddle/fluid/operators/pool_op.h            |  2 +
 paddle/fluid/operators/pool_with_index_op.h |  1 +
 paddle/fluid/operators/prelu_op.cc          |  1 -
 paddle/fluid/operators/prior_box_op.cc      |  2 +-
 paddle/fluid/operators/prior_box_op.cu      |  2 +-
 paddle/fluid/operators/prior_box_op.h       | 18 +++--
 paddle/fluid/operators/rank_loss_op.cc      |  1 +
 paddle/fluid/operators/recv_op.cc           |  2 +-
 paddle/fluid/operators/roi_pool_op.h        |  2 +
 paddle/fluid/operators/strided_memcpy.h     |  4 +-
 paddle/fluid/operators/top_k_op.cu          | 83 +++++++++++----------
 13 files changed, 73 insertions(+), 59 deletions(-)

diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
index a36abe3789..c93c096575 100644
--- a/paddle/fluid/operators/pad_op.h
+++ b/paddle/fluid/operators/pad_op.h
@@ -14,6 +14,8 @@ limitations under the License.
*/ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index c88578570c..63eaaedcd5 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + mkldnn::memory({src_md, mkldnn_engine}, + static_cast(const_cast(input_data))); auto dst_memory = - mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + mkldnn::memory({dst_md, mkldnn_engine}, + static_cast(const_cast(output_data))); auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, *workspace_memory); @@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { pool_bwd_desc, mkldnn_engine, *pool_pd); auto diff_src_memory = - mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + mkldnn::memory({diff_src_md, mkldnn_engine}, + static_cast(const_cast(in_x_grad_data))); auto diff_dst_memory = - mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + mkldnn::memory({diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data))); auto bwd_prim = mkldnn::pooling_backward( pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 2fec50ef25..a48127ea69 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index 83e7bd138a..b55fa76eae 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 7fb45bd19d..8eaa12a4a6 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/prelu_op.h" - #include namespace paddle { diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index 82e54139c8..058b13eeb8 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { bool flip = ctx->Attrs().Get("flip"); std::vector aspect_ratios_vec; - ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); if (max_sizes.size() > 0) { diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu index 76bf2b3b7d..0ea8909296 100644 --- a/paddle/fluid/operators/prior_box_op.cu +++ b/paddle/fluid/operators/prior_box_op.cu @@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h index 1e4a12aac1..1c62fd8d2c 100644 --- a/paddle/fluid/operators/prior_box_op.h +++ b/paddle/fluid/operators/prior_box_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" @@ -22,23 +24,23 @@ namespace operators { inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, bool flip, - std::vector& output_aspect_ratior) { + std::vector* output_aspect_ratior) { constexpr float epsilon = 1e-6; - output_aspect_ratior.clear(); - output_aspect_ratior.push_back(1.0f); + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { float ar = input_aspect_ratior[i]; bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { - if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { already_exist = true; break; } } if (!already_exist) { - output_aspect_ratior.push_back(ar); + output_aspect_ratior->push_back(ar); if (flip) { - output_aspect_ratior.push_back(1.0f / ar); + output_aspect_ratior->push_back(1.0f / ar); } } } @@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 767eef5686..a1127f11a7 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/rank_loss_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 083c1fae5e..a4dcf704a6 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -19,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include #include "paddle/fluid/operators/detail/grpc_client.h" namespace paddle { diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index f38c5a3c0c..54e0749031 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 22c1db82e9..7a10218e15 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_dim, const framework::DDim& dst_stride, T* dst) { - using namespace detail; - StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + paddle::operators::detail::StridedCopyDimVisitor func( + dev_ctx, src, src_stride, dst_stride, dst); boost::apply_visitor(func, dst_dim); } diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index bfd26c2f22..d7f4d383ce 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" namespace paddle { @@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* src, - bool& firstStep, bool& is_empty, - Pair& max, int dim, + bool* firstStep, bool* is_empty, + Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? 
*beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, src, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - (*beam)) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, src, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* val, - int* col, bool& firstStep, - bool& is_empty, Pair& max, + int* col, bool* firstStep, + bool* is_empty, Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, val, col, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - *beam) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, Pair topk[], T** topVal, - int64_t** topIds, int& beam, int& k, + int64_t** topIds, int* beam, int* k, const int tid, const int warp) { while (true) { __syncthreads(); @@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, (*topVal)++; (*topIds)++; } - if (tid == maxid[0]) beam++; - if (--k == 0) break; + if (tid == maxid[0]) (*beam)++; + if (--(*k) == 0) break; __syncthreads(); if (tid == maxid[0]) { - if (beam < MaxLength) { - sh_topk[tid] = topk[beam]; + if (*beam < MaxLength) { + sh_topk[tid] = topk[*beam]; } } if (maxid[0] / 32 == warp) { - if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break; } } } @@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, topk[k].set(-INFINITY, -1); } while (k) { - ThreadGetTopK(topk, beam, k, - src + blockIdx.x * lds, firststep, - is_empty, max, dim, tid); + ThreadGetTopK(topk, &beam, k, + src + blockIdx.x * lds, &firststep, + &is_empty, &max, dim, tid); sh_topk[tid] = topk[0]; BlockReduce(sh_topk, maxid, topk, &output, - &indices, beam, k, tid, warp); + &indices, &beam, &k, tid, warp); } } @@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel { KeMatrixTopK<<< grid, threads, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(output_data, output->dims()[1], - indices_data, input_data, - input_width, input_width, int(k)); + .stream()>>>( + output_data, output->dims()[1], indices_data, input_data, input_width, + input_width, static_cast(k)); } }; From 855992dab0e840109899983baf3a9675185b0c35 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 12 Apr 2018 
17:04:54 -0700 Subject: [PATCH 46/67] Fix warnings in chunk_test --- paddle/fluid/recordio/chunk_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/recordio/chunk_test.cc b/paddle/fluid/recordio/chunk_test.cc index 98ca99b9a0..5177475c01 100644 --- a/paddle/fluid/recordio/chunk_test.cc +++ b/paddle/fluid/recordio/chunk_test.cc @@ -43,5 +43,5 @@ TEST(Chunk, Compressor) { ch.Clear(); ch.Parse(ss); - ASSERT_EQ(ch.NumBytes(), 18); + ASSERT_EQ(ch.NumBytes(), 18ul); } From b0267ac93a84cdb3be3099b869c1c334b7e26096 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 13 Apr 2018 11:31:59 +0800 Subject: [PATCH 47/67] refine broadcast op --- .../framework/details/broadcast_op_handle.cc | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index cd9bff52d9..53e8f9f366 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -32,8 +32,14 @@ void BroadcastOpHandle::RunImpl() { // Wait input done, this Wait is asynchronous operation auto in_var_handle = static_cast(this->inputs_[0]); auto &in_place = in_var_handle->place_; - if (inputs_[0]->generated_op_) + if (inputs_[0]->generated_op_) { inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]); + for (auto *out : outputs_) { + auto out_handle = static_cast(out); + auto &out_p = out_handle->place_; + inputs_[0]->generated_op_->Wait(dev_ctxes_[out_p]); + } + } auto in_scope_idx = in_var_handle->scope_idx_; PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), @@ -74,9 +80,24 @@ void BroadcastOpHandle::RunImpl() { } Tensor *out_tensor = GetTensorFromVar(out_var); - - paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]), - out_tensor); + if (platform::is_cpu_place(in_place)) { + paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]), + out_tensor); + } else if (platform::is_gpu_place(in_place)) { +#ifdef PADDLE_WITH_CUDA + auto src_gpu_place = boost::get(in_place); + auto dst_gpu_place = boost::get(out_p); + void *dst_ptr = out_tensor->mutable_data(out_p); + void *src_ptr = in_tensor->data(); + int64_t size = in_tensor->numel(); + memory::Copy( + dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(dev_ctxes_[out_p]) + ->stream()); +#else + PADDLE_THROW("CUDAPlace is not supported in CPU device."); +#endif + } } } From c20cc2bd8a018f078e3916e01579df8faab66f92 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 13 Apr 2018 05:45:52 +0000 Subject: [PATCH 48/67] Add Wait() for reshape_op --- paddle/fluid/operators/reshape_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 9abc78421a..8320c257c9 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -147,6 +147,7 @@ class ReshapeKernel : public framework::OpKernel { if (!inplace) { out->mutable_data(ctx.GetPlace()); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); + ctx.device_context().Wait(); // TensorCopy will resize to in_dims. 
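// Note: TensorCopy only enqueues the copy on the device's stream, so the
// Wait() above is what guarantees the data has actually landed before the
// tensor is resized and handed back to the caller.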
out->Resize(out_dims); } else { @@ -169,6 +170,7 @@ class ReshapeGradKernel : public framework::OpKernel { auto in_dims = d_x->dims(); if (!inplace) { framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); + ctx.device_context().Wait(); d_x->Resize(in_dims); } else { d_x->ShareDataWith(*d_out); From a08bf76f74cbdd4db4a773a4557b4ad6551ce679 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 13 Apr 2018 13:52:39 +0800 Subject: [PATCH 49/67] refine name --- paddle/fluid/framework/threadpool.cc | 10 +++++----- paddle/fluid/framework/threadpool.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 109c2c745c..f26f212d4d 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -95,15 +95,15 @@ void ThreadPool::TaskLoop() { } } -std::unique_ptr MultiStreamThreadPool::io_threadpool_(nullptr); -std::once_flag MultiStreamThreadPool::io_init_flag_; +std::unique_ptr ThreadPoolIO::io_threadpool_(nullptr); +std::once_flag ThreadPoolIO::io_init_flag_; -ThreadPool* MultiStreamThreadPool::GetInstanceIO() { - std::call_once(io_init_flag_, &MultiStreamThreadPool::InitIO); +ThreadPool* ThreadPoolIO::GetInstanceIO() { + std::call_once(io_init_flag_, &ThreadPoolIO::InitIO); return io_threadpool_.get(); } -void MultiStreamThreadPool::InitIO() { +void ThreadPoolIO::InitIO() { if (io_threadpool_.get() == nullptr) { // TODO(typhoonzero1986): make this configurable io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size)); diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 1cc058834c..94111ee335 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -135,7 +135,7 @@ class ThreadPool { std::condition_variable completed_; }; -class MultiStreamThreadPool : ThreadPool { +class ThreadPoolIO : ThreadPool { public: static ThreadPool* GetInstanceIO(); static void InitIO(); @@ -156,7 +156,7 @@ std::future Async(Callback callback) { template std::future AsyncIO(Callback callback) { - return MultiStreamThreadPool::GetInstanceIO()->Run(callback); + return ThreadPoolIO::GetInstanceIO()->Run(callback); } } // namespace framework From 3fa0ef3d7102615848f8793c1151c6ec069cd296 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 13 Apr 2018 06:40:50 +0000 Subject: [PATCH 50/67] Refine double_buffer code --- .../reader/create_double_buffer_reader_op.cc | 62 +++++++------------ 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 33a50b5ceb..0b7c1d6af7 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -33,28 +33,14 @@ static constexpr size_t kChannelSize = 0; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: - struct Item { - Item() : ctx_(nullptr) {} - Item(Item&& b) { - payloads_ = std::move(b.payloads_); - ctx_ = std::move(b.ctx_); - } - Item& operator=(Item&& b) { - payloads_ = std::move(b.payloads_); - ctx_ = std::move(b.ctx_); - return *this; - } - - std::vector payloads_; - platform::DeviceContext* ctx_; - }; - explicit DoubleBufferReader( ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { + cpu_tensor_cache_.resize(kCacheSize); + 
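// Both caches hold kCacheSize pre-allocated slots; the prefetch thread fills
// slot i and sends only the index i through channel_, so tensor vectors are
// never copied through the channel itself (see ReadNext and
// PrefetchThreadFunc below).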
gpu_tensor_cache_.resize(kCacheSize); #ifdef PADDLE_WITH_CUDA - for (size_t i = 0; i < kCacheSize; ++i) { - if (platform::is_gpu_place(place_)) { + if (platform::is_gpu_place(place_)) { + for (size_t i = 0; i < kCacheSize; ++i) { ctxs_.emplace_back(new platform::CUDADeviceContext( boost::get(place_))); } @@ -72,7 +58,7 @@ class DoubleBufferReader : public framework::DecoratedReader { bool HasNext() const; void StartPrefetcher() { - channel_ = framework::MakeChannel(kChannelSize); + channel_ = framework::MakeChannel(kChannelSize); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); } @@ -88,8 +74,10 @@ class DoubleBufferReader : public framework::DecoratedReader { void PrefetchThreadFunc(); std::thread prefetcher_; - framework::Channel* channel_; + framework::Channel* channel_; platform::Place place_; + std::vector> cpu_tensor_cache_; + std::vector> gpu_tensor_cache_; std::vector> ctxs_; }; @@ -153,11 +141,14 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { void DoubleBufferReader::ReadNext(std::vector* out) { out->clear(); if (HasNext()) { - Item batch; - channel_->Receive(&batch); - *out = batch.payloads_; - if (batch.ctx_) { - batch.ctx_->Wait(); + size_t cached_tensor_id; + channel_->Receive(&cached_tensor_id); + if (platform::is_gpu_place(place_)) { + *out = gpu_tensor_cache_[cached_tensor_id]; + ctxs_[cached_tensor_id]->Wait(); + } else { + // CPU place + *out = cpu_tensor_cache_[cached_tensor_id]; } } } @@ -176,42 +167,33 @@ bool DoubleBufferReader::HasNext() const { void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; - std::vector> cpu_tensor_cache(kCacheSize); - std::vector> gpu_tensor_cache(kCacheSize); size_t cached_tensor_id = 0; - while (true) { - Item batch; - auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; + auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id]; reader_->ReadNext(&cpu_batch); if (cpu_batch.empty()) { // The underlying reader have no next data. break; } if (platform::is_gpu_place(place_)) { - auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; + auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id]; auto* gpu_ctx = ctxs_[cached_tensor_id].get(); gpu_batch.resize(cpu_batch.size()); for (size_t i = 0; i < cpu_batch.size(); ++i) { framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); gpu_batch[i].set_lod(cpu_batch[i].lod()); } - batch.payloads_ = gpu_batch; - batch.ctx_ = gpu_ctx; - } else { - // CPUPlace - batch.payloads_ = cpu_batch; } - ++cached_tensor_id; - cached_tensor_id %= kCacheSize; - try { - channel_->Send(&batch); + size_t tmp = cached_tensor_id; + channel_->Send(&tmp); } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The double buffer channel has been closed. 
The " "prefetch thread will terminate."; break; } + ++cached_tensor_id; + cached_tensor_id %= kCacheSize; } channel_->Close(); VLOG(5) << "Prefetch thread terminates."; From 6b20b35589c3443bbd49fde2b71b5c4e0e5b8cc0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Apr 2018 15:22:04 +0800 Subject: [PATCH 51/67] Fix Transformer Hang Problem --- .../details/computation_op_handle.cc | 4 ++- .../details/nccl_all_reduce_op_handle.cc | 10 +++--- .../fluid/framework/details/op_handle_base.cc | 32 ++++++++++++------- .../fluid/framework/details/op_handle_base.h | 2 ++ .../details/scale_loss_grad_op_handle.cc | 14 +++++--- .../fluid/framework/details/send_op_handle.cc | 2 +- .../details/threaded_ssa_graph_executor.cc | 4 ++- paddle/fluid/platform/device_context.cc | 2 +- paddle/fluid/platform/device_context.h | 9 +++++- 9 files changed, 54 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index e3f8bbb72f..ff6d91c1da 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() { } } - op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); + this->RunAndRecordEvent([this] { + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); + }); } std::string ComputationOpHandle::Name() const { return op_->Type(); } diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 55b5f11358..0611ec6376 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -69,10 +69,12 @@ void NCCLAllReduceOpHandle::RunImpl() { }); } - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index e4194a7442..846bc21be2 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) { #endif RunImpl(); - -#ifdef PADDLE_WITH_CUDA - if (use_event) { - for (auto &p : dev_ctxes_) { - int dev_id = boost::get(p.first).device; - auto stream = - static_cast(p.second)->stream(); - PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream)); - } - } -#endif } void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { @@ -97,6 +86,27 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { out->generated_op_ = this; } +void OpHandleBase::RunAndRecordEvent(const std::function &callback) { +#ifdef PADDLE_WITH_CUDA + if (!events_.empty()) { // Use event + std::function method = callback; + + for (auto &p : dev_ctxes_) { + method = [method, p, this]() { + static_cast(p.second)->RecordEvent( + events_.at(boost::get(p.first).device), + method); + }; + } + method(); + } else { +#endif + callback(); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index fbdb54ba8d..1aacba5a4c 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h 
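The helper declared in the hunk below lets a subclass funnel its whole
RunImpl() body through a callback, so that when events are enabled each
device context records its CUDA event on its own stream right after the work
is issued. A minimal usage sketch (FooOpHandle is a hypothetical subclass;
the pattern follows the computation_op_handle.cc hunk above):

void FooOpHandle::RunImpl() {
  // The callback runs inside RunAndRecordEvent, which records this op's
  // events_ on every device context it touches once the callback returns.
  this->RunAndRecordEvent([this] {
    // ... issue the actual kernels / run the wrapped operator here ...
  });
}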
@@ -62,6 +62,8 @@ class OpHandleBase { virtual bool IsMultiDeviceTransfer() { return false; } protected: + void RunAndRecordEvent(const std::function &callback); + virtual void RunImpl() = 0; }; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 0a6f6129b8..7fb9f99a8a 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include + namespace paddle { namespace framework { namespace details { @@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() { *tmp = coeff_; } else { #ifdef PADDLE_WITH_CUDA - auto stream = - static_cast(this->dev_ctxes_[place_]) - ->stream(); - memory::Copy(boost::get(place_), tmp, - platform::CPUPlace(), &coeff_, sizeof(float), stream); + this->RunAndRecordEvent([&] { + auto stream = + static_cast(this->dev_ctxes_[place_]) + ->stream(); + memory::Copy(boost::get(place_), tmp, + platform::CPUPlace(), &coeff_, sizeof(float), stream); + }); #endif } } diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc index d181607e86..549b9d9abb 100644 --- a/paddle/fluid/framework/details/send_op_handle.cc +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() { } in->generated_op_->Wait(dev_ctxes_[p]); } - op_->Run(*local_scope_, place_); + this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); }); } std::string SendOpHandle::Name() const { return "send"; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 1ce69ab02b..a371ee10fe 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -196,10 +196,12 @@ void ThreadedSSAGraphExecutor::RunOp( BlockingQueue *ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { - VLOG(10) << op->Name() << " : " << op->DebugString(); + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); op->Run(use_event_); + VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->outputs_); + VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EnforceNotMet ex) { exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index f03165fae5..1f733d71bd 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -175,7 +175,7 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b175583379..a9c1984616 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -98,13 +98,20 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cuda stream in the device context. 
*/ cudaStream_t stream() const; + template + void RecordEvent(cudaEvent_t ev, Callback callback) { + std::lock_guard guard(mutex_); + callback(); + PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); + } + private: CUDAPlace place_; std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; - mutable std::mutex mutex_; + mutable std::recursive_mutex mutex_; cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; From cedade949412a1fcffa12714375e03e4234282af Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Apr 2018 16:30:08 +0800 Subject: [PATCH 52/67] Stash --- .../details/nccl_all_reduce_op_handle.cc | 84 ++++++++++++++----- 1 file changed, 64 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 55b5f11358..6e4314e2a8 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" +#include + namespace paddle { namespace framework { namespace details { @@ -27,6 +29,32 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( } } +struct ReduceLoDTensor { + const std::vector &src_tensors_; + LoDTensor &dst_tensor_; + + ReduceLoDTensor(const std::vector &src, LoDTensor *dst) + : src_tensors_(src), dst_tensor_(*dst) {} + + template + void operator()() const { + PADDLE_ENFORCE(!src_tensors_.empty()); + auto &t0 = src_tensors_[0]; + PADDLE_ENFORCE_NE(t0.numel(), 0); + dst_tensor_.Resize(t0.dims()); + T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); + std::copy(t0.data(), t0.data() + t0.numel(), dst); + + for (size_t i = 1; i < src_tensors_.size(); ++i) { + auto &t = src_tensors_[i]; + PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); + PADDLE_ENFORCE_EQ(t.type(), t0.type()); + std::transform(t.data(), t.data() + t.numel(), dst, dst, + [](T a, T b) -> T { return a + b; }); + } + } +}; + void NCCLAllReduceOpHandle::RunImpl() { if (inputs_.size() == 1) { return; // No need to all reduce when GPU count = 1; @@ -41,37 +69,53 @@ void NCCLAllReduceOpHandle::RunImpl() { int dtype = -1; size_t numel = 0; - std::vector> all_reduce_calls; + std::vector lod_tensors; for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; auto *s = local_scopes_[i]; - int dev_id = boost::get(p).device; auto &lod_tensor = s->FindVar(var_name)->Get(); - void *buffer = const_cast(lod_tensor.data()); + lod_tensors.emplace_back(lod_tensor); + } + + if (platform::is_gpu_place(lod_tensors[0].place())) { + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto &lod_tensor = lod_tensors[i]; + void *buffer = const_cast(lod_tensor.data()); - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } + + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); + } + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_.at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), + ncclSum, comm, stream)); + }); } - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); } + } else { // Special handle CPU only 
Operator's gradient. Like CRF + framework::LoDTensor trg; - auto &nccl_ctx = nccl_ctxs_.at(dev_id); - auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), ncclSum, - comm, stream)); - }); - } + // Reduce All Tensor to trg in CPU + ReduceLoDTensor func(lod_tensors, &trg); + VisitDataType(ToDataType(lod_tensors[0].type()), func); - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); + // Copy trg to GPU } } } From 79be06045c2cfd97b14991dac5bdbe2a2fa765db Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Apr 2018 16:43:44 +0800 Subject: [PATCH 53/67] Support CPU/GPU mixture for ParallelExecutor --- .../details/nccl_all_reduce_op_handle.cc | 13 +++++++++++++ paddle/fluid/framework/details/op_handle_base.cc | 16 ++++++++++++++++ paddle/fluid/framework/details/op_handle_base.h | 3 +++ 3 files changed, 32 insertions(+) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 3547a6e21c..1e48f75958 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -116,6 +116,19 @@ void NCCLAllReduceOpHandle::RunImpl() { // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); VisitDataType(ToDataType(lod_tensors[0].type()), func); + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &scope = local_scopes_[i]; + auto &p = places_[i]; + auto *var = scope->FindVar(var_name); + auto *dev_ctx = dev_ctxes_[p]; + + RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { + auto &tensor_gpu = *var->GetMutable(); + auto &tensor_cpu = trg; + TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); + }); + } } } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 846bc21be2..28f1e7b508 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -107,6 +107,22 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #endif } +void OpHandleBase::RunAndRecordEvent(platform::Place p, + const std::function &callback) { + if (platform::is_cpu_place(p) || events_.empty()) { + callback(); + } else { +#ifdef PADDLE_WITH_CUDA + auto *ctx = dev_ctxes_.at(p); + auto *cuda_ctx = static_cast(ctx); + cuda_ctx->RecordEvent(events_.at(boost::get(p).device), + callback); +#else + PADDLE_THROW("Not implemented"); +#endif + } +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 1aacba5a4c..a9a6c8d39c 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -64,6 +64,9 @@ class OpHandleBase { protected: void RunAndRecordEvent(const std::function &callback); + void RunAndRecordEvent(platform::Place p, + const std::function &callback); + virtual void RunImpl() = 0; }; From 02842cfc2508240cca89bac59d1beaac2f5da2b6 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 13 Apr 2018 13:46:17 +0800 Subject: [PATCH 54/67] enhance broadcast_op_handle and gather_op_handle --- .../framework/details/broadcast_op_handle.cc | 71 +++++--- .../details/broadcast_op_handle_test.cc | 151 +++++++++--------- .../framework/details/gather_op_handle.cc | 131 +++++++-------- 
.../details/gather_op_handle_test.cc | 129 ++++++++------- .../fluid/framework/details/op_handle_base.cc | 15 -- .../fluid/framework/details/op_handle_base.h | 8 - 6 files changed, 266 insertions(+), 239 deletions(-) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 53e8f9f366..24115cae81 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -18,45 +18,74 @@ namespace paddle { namespace framework { namespace details { +Tensor *GetTensorFromVar(Variable *in_var) { + if (in_var->IsType()) { + return in_var->GetMutable(); + } else if (in_var->IsType()) { + return in_var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } + return nullptr; +} + BroadcastOpHandle::BroadcastOpHandle(const std::vector &local_scopes, const std::vector &places) : local_scopes_(local_scopes), places_(places) {} void BroadcastOpHandle::RunImpl() { - PADDLE_ENFORCE_EQ(this->inputs_.size(), 1, + // the input may have dummy var. + std::vector in_var_handle; + for (auto *in : inputs_) { + auto *out_handle = dynamic_cast(in); + if (out_handle) { + in_var_handle.push_back(out_handle); + } + } + PADDLE_ENFORCE_EQ(in_var_handle.size(), 1, "The number of input should be one."); + + // the output may have dummy var. + std::vector out_var_handles; + for (auto *out : outputs_) { + auto *out_handle = dynamic_cast(out); + if (out_handle) { + out_var_handles.push_back(out_handle); + } + } + PADDLE_ENFORCE_EQ( - this->outputs_.size(), places_.size(), + out_var_handles.size(), places_.size(), "The number of output should equal to the number of places."); // Wait input done, this Wait is asynchronous operation - auto in_var_handle = static_cast(this->inputs_[0]); - auto &in_place = in_var_handle->place_; - if (inputs_[0]->generated_op_) { - inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]); - for (auto *out : outputs_) { - auto out_handle = static_cast(out); - auto &out_p = out_handle->place_; - inputs_[0]->generated_op_->Wait(dev_ctxes_[out_p]); + auto &in_place = in_var_handle[0]->place_; + if (in_var_handle[0]->generated_op_) { + in_var_handle[0]->generated_op_->Wait(dev_ctxes_[in_place]); + for (auto *out : out_var_handles) { + auto &out_p = out->place_; + if (platform::is_same_place(in_place, out_p)) continue; + in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]); } } - auto in_scope_idx = in_var_handle->scope_idx_; + // + auto in_scope_idx = in_var_handle[0]->scope_idx_; PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), "The input(%s) is not in the local_scopes.", - in_var_handle->name_); - auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle->name_); - + in_var_handle[0]->name_); + auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle[0]->name_); Tensor *in_tensor = GetTensorFromVar(in_var); - for (auto *out : outputs_) { - auto out_handle = static_cast(out); - auto &out_p = out_handle->place_; - auto out_scope_idx = out_handle->scope_idx_; + for (auto *out : out_var_handles) { + auto &out_p = out->place_; + + auto out_scope_idx = out->scope_idx_; PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), - "%s is not in the local_scopes ", out_handle->name_); + "%s is not in the local_scopes ", out->name_); + auto *s = local_scopes_[out_scope_idx]; - auto out_var = s->FindVar(out_handle->name_); + auto out_var = s->FindVar(out->name_); PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(), "The 
place of input and output should be the same."); @@ -89,7 +118,7 @@ void BroadcastOpHandle::RunImpl() { auto dst_gpu_place = boost::get(out_p); void *dst_ptr = out_tensor->mutable_data(out_p); void *src_ptr = in_tensor->data(); - int64_t size = in_tensor->numel(); + int64_t size = in_tensor->numel() * SizeOfType(in_tensor->type()); memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, reinterpret_cast(dev_ctxes_[out_p]) diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index 9bf72f0360..dfc52b012f 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -27,8 +27,20 @@ namespace p = paddle::platform; // test data amount const f::DDim kDims = {20, 20}; -class BroadcastTester : public ::testing::Test { - public: +struct TestBroadcastOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + Scope g_scope_; + std::unique_ptr op_handle_; + std::vector> vars_; + std::vector gpu_list_; + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } + void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #ifdef PADDLE_WITH_CUDA @@ -57,61 +69,56 @@ class BroadcastTester : public ::testing::Test { } } - void BroadcastInitOp(int input_scope_idx) { + void InitBroadcastOp(size_t input_scope_idx) { for (size_t j = 0; j < gpu_list_.size(); ++j) { - local_scope_.push_back(&g_scope_.NewScope()); - local_scope_[j]->Var("out"); + local_scopes_.push_back(&(g_scope_.NewScope())); + local_scopes_[j]->Var("out"); } - local_scope_[input_scope_idx]->Var("input"); + local_scopes_[input_scope_idx]->Var("input"); - bc_op_handle_ = new f::details::BroadcastOpHandle(local_scope_, gpu_list_); + op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_)); - f::details::VarHandle* in_var_handle = new f::details::VarHandle(); + vars_.emplace_back(new VarHandle()); + VarHandle* in_var_handle = static_cast(vars_.back().get()); in_var_handle->place_ = gpu_list_[input_scope_idx]; in_var_handle->name_ = "input"; in_var_handle->version_ = 1; in_var_handle->scope_idx_ = input_scope_idx; in_var_handle->generated_op_ = nullptr; - bc_op_handle_->AddInput(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle* dummy_var_handle = + static_cast(vars_.back().get()); + dummy_var_handle->generated_op_ = nullptr; + op_handle_->AddInput(dummy_var_handle); for (size_t j = 0; j < gpu_list_.size(); ++j) { - bc_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j]; - f::details::VarHandle* out_var_handle = new f::details::VarHandle(); + op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get(); + vars_.emplace_back(new VarHandle()); + VarHandle* out_var_handle = static_cast(vars_.back().get()); out_var_handle->place_ = gpu_list_[j]; out_var_handle->name_ = "out"; out_var_handle->version_ = 2; out_var_handle->scope_idx_ = j; - bc_op_handle_->AddOutput(out_var_handle); - } - } - void BroadcastOpDestroy() { - for (auto in : bc_op_handle_->inputs_) { - delete in; - } - for (auto out : bc_op_handle_->outputs_) { - delete out; + op_handle_->AddOutput(out_var_handle); } - delete bc_op_handle_; - for (size_t j = 0; j < ctxs_.size(); ++j) { - delete ctxs_[j]; - } - } - void WaitAll() { - for (size_t j = 0; j < ctxs_.size(); ++j) { - ctxs_[j]->Wait(); - } + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle* out_dummy_var_handle = + 
static_cast(vars_.back().get()); + out_dummy_var_handle->generated_op_ = nullptr; + op_handle_->AddOutput(out_dummy_var_handle); } - void TestBroadcastLodTensor() { - int input_scope_idx = 0; - BroadcastInitOp(input_scope_idx); - - auto in_var = local_scope_[input_scope_idx]->Var("input"); + void TestBroadcastLodTensor(size_t input_scope_idx) { + auto in_var = local_scopes_[input_scope_idx]->Var("input"); auto in_lod_tensor = in_var->GetMutable(); in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); - std::vector send_vector(f::product(kDims), input_scope_idx + 12); + std::vector send_vector(static_cast(f::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k; } @@ -120,13 +127,13 @@ class BroadcastTester : public ::testing::Test { send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); in_lod_tensor->set_lod(lod); - bc_op_handle_->Run(false); + op_handle_->Run(false); WaitAll(); p::CPUPlace cpu_place; for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = local_scope_[j]->Var("out"); + auto out_var = local_scopes_[j]->Var("out"); auto out_tensor = out_var->Get(); PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); @@ -134,42 +141,37 @@ class BroadcastTester : public ::testing::Test { f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor); float* ct = result_tensor.mutable_data(cpu_place); - for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + for (int64_t i = 0; i < f::product(kDims); ++i) { + ASSERT_NEAR(ct[i], send_vector[i], 1e-5); } } - - BroadcastOpDestroy(); } - void TestBroadcastSelectedRows() { - int input_scope_idx = 0; - BroadcastInitOp(input_scope_idx); - - auto in_var = local_scope_[input_scope_idx]->Var("input"); + void TestBroadcastSelectedRows(size_t input_scope_idx) { + auto in_var = local_scopes_[input_scope_idx]->Var("input"); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); - int height = kDims[0] * 2; + int height = static_cast(kDims[0]) * 2; std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; in_selected_rows->set_height(height); in_selected_rows->set_rows(rows); - std::vector send_vector(f::product(kDims)); + std::vector send_vector(static_cast(f::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k; } paddle::framework::TensorFromVector( send_vector, *(ctxs_[input_scope_idx]), value); - bc_op_handle_->Run(false); + op_handle_->Run(false); WaitAll(); p::CPUPlace cpu_place; for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = local_scope_[j]->Var("out"); + auto out_var = local_scopes_[j]->Var("out"); auto& out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); @@ -183,41 +185,44 @@ class BroadcastTester : public ::testing::Test { f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor); float* ct = result_tensor.data(); - for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], send_vector[j], 1e-5); + for (int64_t i = 0; i < f::product(kDims); ++i) { + ASSERT_NEAR(ct[i], send_vector[i], 1e-5); } } - - BroadcastOpDestroy(); } - - public: - f::Scope g_scope_; - std::vector ctxs_; - std::vector local_scope_; - std::vector gpu_list_; - f::details::BroadcastOpHandle* bc_op_handle_; }; -TEST_F(BroadcastTester, TestCPUBroadcastTestLodTensor) { - InitCtxOnGpu(false); - TestBroadcastLodTensor(); +TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) { + 
TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastLodTensor(input_scope_idx); } -TEST_F(BroadcastTester, TestCPUBroadcastTestSelectedRows) { - InitCtxOnGpu(false); - TestBroadcastSelectedRows(); +TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastSelectedRows(input_scope_idx); } #ifdef PADDLE_WITH_CUDA -TEST_F(BroadcastTester, TestGPUBroadcastTestLodTensor) { - InitCtxOnGpu(true); - TestBroadcastLodTensor(); +TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastLodTensor(input_scope_idx); } -TEST_F(BroadcastTester, TestGPUBroadcastTestSelectedRows) { - InitCtxOnGpu(true); - TestBroadcastSelectedRows(); +TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastSelectedRows(input_scope_idx); } #endif diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index f9dfb2f5c6..3c3054c03d 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -23,32 +23,54 @@ GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, : local_scopes_(local_scopes), places_(places) {} void GatherOpHandle::RunImpl() { + // the input may have dummy var. + std::vector in_var_handles; + for (auto *in : inputs_) { + auto *in_handle = dynamic_cast(in); + if (in_handle) { + in_var_handles.push_back(in_handle); + } + } PADDLE_ENFORCE_EQ( - this->inputs_.size(), places_.size(), - "The number of inputs should be equal to the number of place."); - PADDLE_ENFORCE_EQ(this->outputs_.size(), 1, + in_var_handles.size(), places_.size(), + "The number of output should equal to the number of places."); + + // the output may have dummy var. 
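  // The outputs are filtered the same way as the inputs above:
  // dependency-only DummyVarHandles are skipped via dynamic_cast, and only
  // real VarHandles reach the size and place checks that follow.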
+ std::vector out_var_handles; + for (auto *out : outputs_) { + auto *out_handle = dynamic_cast(out); + if (out_handle) { + out_var_handles.push_back(out_handle); + } + } + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, "The number of output should be one."); - auto in_0_handle = static_cast(inputs_[0]); + + auto in_0_handle = static_cast(in_var_handles[0]); auto pre_in_var = local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_); + auto pre_place = in_0_handle->place_; + PADDLE_ENFORCE(pre_in_var->IsType(), "Currently, gather_op only can gather SelectedRows."); - auto pre_place = in_0_handle->place_; + + PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(), + "The place of input and output should be the same."); // Wait input done, this Wait is asynchronous operation - for (auto *in : inputs_) { - if (inputs_[0]->generated_op_) { - auto &p = static_cast(in)->place_; - in->generated_op_->Wait(dev_ctxes_[p]); + for (auto *in : in_var_handles) { + if (in->generated_op_) { + in->generated_op_->Wait(dev_ctxes_[in->place_]); } } std::vector out_rows; - std::vector in_tensors; + std::vector in_tensors; std::vector in_places; + auto &pre_in = pre_in_var->Get(); // gather the inputs - for (auto *in : inputs_) { + for (auto *in : in_var_handles) { auto in_handle = static_cast(in); auto in_p = in_handle->place_; in_places.push_back(in_p); @@ -58,63 +80,46 @@ void GatherOpHandle::RunImpl() { "The place of input should be the same."); auto *s = local_scopes_[in_handle->scope_idx_]; auto in_var = s->FindVar(in_handle->name_); - PADDLE_ENFORCE_EQ(in_var->Type(), pre_in_var->Type(), + + auto &in_sr = in_var->Get(); + + PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(), "The type of input is not consistent."); + PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), + "The height of inputs is not consistent."); + PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), , + "The dims of inputs is not consistent."); - if (in_var->IsType()) { - auto &pre_in = pre_in_var->Get(); - auto &in_sr = in_var->Get(); - auto in_sr_rows = in_sr.rows(); - out_rows.insert(out_rows.begin(), in_sr_rows.begin(), in_sr_rows.end()); - PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), - "The height of inputs is not consistent."); - PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), , - "The dims of inputs is not consistent."); - } else if (in_var->IsType()) { - auto &pre_in = pre_in_var->Get(); - auto &in_lodtensor = in_var->Get(); - PADDLE_ENFORCE_EQ(in_lodtensor.lod(), pre_in.lod(), - "The lod of inputs is not consistent."); - PADDLE_ENFORCE_EQ(in_lodtensor.dims(), pre_in.dims(), - "The dims of inputs is not consistent."); - } else { - PADDLE_THROW("Var should be LoDTensor or SelectedRows."); - } - in_tensors.push_back(GetTensorFromVar(in_var)); - pre_in_var = in_var; + auto in_sr_rows = in_sr.rows(); + out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end()); + + in_tensors.emplace_back(in_sr.value()); } // write the output - auto out_handle = static_cast(this->outputs_[0]); - auto &out_place = out_handle->place_; - auto out_scope_idx = out_handle->scope_idx_; - auto out_var = local_scopes_[out_scope_idx]->FindVar(out_handle->name_); - PADDLE_ENFORCE_EQ(out_place.which(), pre_place.which(), - "The place of input and output should be the same."); - if (pre_in_var->IsType()) { - auto &pre_in = pre_in_var->Get(); - auto out = out_var->GetMutable(); - out->set_height(pre_in.height()); - out->set_rows(out_rows); - size_t rows = out_rows.size(); - 
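  // The merged SelectedRows keeps pre_in's height plus the concatenated
  // rows; below, its value tensor is resized to [sum of input rows, ...] and
  // each input is copied into its own slice [s, e) on that input's device
  // context.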
DDim out_dim = pre_in.GetCompleteDims(); - out_dim[0] = static_cast(rows); - out->mutable_value()->Resize(out_dim); - out->mutable_value()->mutable_data(out_place, pre_in.value().type()); - auto out_tensor = out->mutable_value(); - // copy - int s = 0, e = 0; - for (size_t j = 0; j < in_tensors.size(); ++j) { - e += in_tensors[j]->dims()[0]; - auto sub_out = out_tensor->Slice(s, e); - paddle::framework::TensorCopy(*(in_tensors[j]), out_place, - *(dev_ctxes_[in_places[j]]), &sub_out); - s = e; - } - } else if (pre_in_var->IsType()) { - PADDLE_THROW("Currently, Var only can be SelectedRows."); - } else { - PADDLE_THROW("Var should be SelectedRows."); + auto &out_place = out_var_handles[0]->place_; + auto out_scope_idx = out_var_handles[0]->scope_idx_; + auto out_var = + local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_); + + auto out = out_var->GetMutable(); + out->set_height(pre_in.height()); + out->set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in.GetCompleteDims(); + out_dim[0] = static_cast(rows); + out->mutable_value()->Resize(out_dim); + out->mutable_value()->mutable_data(out_place, pre_in.value().type()); + Tensor *out_tensor = out->mutable_value(); + + // copy + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j].dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(in_tensors[j], out_place, + *(dev_ctxes_[in_places[j]]), &sub_out); + s = e; } } diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3cf2155320..10839f239d 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -26,14 +26,26 @@ namespace p = paddle::platform; // test data amount const f::DDim kDims = {20, 20}; -class GatherTester : public ::testing::Test { - public: +struct TestGatherOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + Scope g_scope_; + std::unique_ptr op_handle_; + std::vector> vars_; + std::vector gpu_list_; + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } + void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #ifdef PADDLE_WITH_CUDA int count = p::GetCUDADeviceCount(); if (count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu Gather, because the CUDA " + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " << count; exit(0); @@ -56,57 +68,51 @@ class GatherTester : public ::testing::Test { } } - void InitGatherOp(int input_scope_idx) { + void InitGatherOp(size_t input_scope_idx) { for (size_t j = 0; j < gpu_list_.size(); ++j) { - local_scope_.push_back(&g_scope_.NewScope()); - local_scope_[j]->Var("input"); + local_scopes_.push_back(&(g_scope_.NewScope())); + local_scopes_[j]->Var("out"); } - local_scope_[input_scope_idx]->Var("out"); - - gather_op_handle_ = new f::details::GatherOpHandle(local_scope_, gpu_list_); - - f::details::VarHandle* out_var_handle = new f::details::VarHandle(); - out_var_handle->place_ = gpu_list_[input_scope_idx]; - out_var_handle->name_ = "out"; - out_var_handle->version_ = 2; - out_var_handle->scope_idx_ = input_scope_idx; - out_var_handle->generated_op_ = gather_op_handle_; - gather_op_handle_->AddOutput(out_var_handle); + local_scopes_[input_scope_idx]->Var("input"); + op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_)); + // add input for (size_t j = 0; j < gpu_list_.size(); ++j) { - 
-      gather_op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j];
-      f::details::VarHandle* in_var_handle = new f::details::VarHandle();
+      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      vars_.emplace_back(new VarHandle());
+      VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
       in_var_handle->place_ = gpu_list_[j];
       in_var_handle->name_ = "input";
       in_var_handle->version_ = 1;
       in_var_handle->scope_idx_ = j;
       in_var_handle->generated_op_ = nullptr;
-      gather_op_handle_->AddInput(in_var_handle);
-    }
-  }
-  void GatherOpDestroy() {
-    for (auto in : gather_op_handle_->inputs_) {
-      delete in;
-    }
-    for (auto out : gather_op_handle_->outputs_) {
-      delete out;
-    }
-    delete gather_op_handle_;
-    for (size_t j = 0; j < ctxs_.size(); ++j) {
-      delete ctxs_[j];
+      op_handle_->AddInput(in_var_handle);
     }
-  }
 
-  void WaitAll() {
-    for (size_t j = 0; j < ctxs_.size(); ++j) {
-      ctxs_[j]->Wait();
-    }
-  }
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* in_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    vars_.emplace_back(new VarHandle());
+    VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
+    out_var_handle->place_ = gpu_list_[input_scope_idx];
+    out_var_handle->name_ = "out";
+    out_var_handle->version_ = 2;
+    out_var_handle->scope_idx_ = input_scope_idx;
+    op_handle_->AddOutput(out_var_handle);
 
-  void TestGatherSelectedRows() {
-    int output_scope_idx = 0;
-    InitGatherOp(output_scope_idx);
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
 
+  void TestGatherSelectedRows(size_t output_scope_idx) {
     int height = kDims[0] * 2;
     std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
                               2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
@@ -117,7 +123,7 @@ class GatherTester : public ::testing::Test {
 
     for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
          ++input_scope_idx) {
-      auto in_var = local_scope_[input_scope_idx]->Var("input");
+      auto in_var = local_scopes_[input_scope_idx]->Var("input");
       auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
       auto value = in_selected_rows->mutable_value();
       value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -130,13 +136,21 @@ class GatherTester : public ::testing::Test {
       value->Resize(kDims);
     }
 
-    gather_op_handle_->Run(false);
+    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
 
     WaitAll();
 
     p::CPUPlace cpu_place;
 
-    auto out_var = local_scope_[output_scope_idx]->Var("out");
     auto& out_select_rows = out_var->Get<f::SelectedRows>();
     auto rt = out_select_rows.value();
 
@@ -152,28 +166,25 @@ class GatherTester : public ::testing::Test {
     for (int64_t j = 0; j < f::product(kDims); ++j) {
       ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
     }
-
-    GatherOpDestroy();
   }
-
- public:
-  f::Scope g_scope_;
-  std::vector<p::DeviceContext*> ctxs_;
-  std::vector<f::Scope*> local_scope_;
-  std::vector<p::Place> gpu_list_;
-  f::details::GatherOpHandle* gather_op_handle_;
 };
 
-TEST_F(GatherTester, TestCPUGatherTestSelectedRows) {
-  InitCtxOnGpu(false);
-  TestGatherSelectedRows();
+TEST(GatherTester, TestCPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
 }
 
 #ifdef PADDLE_WITH_CUDA
-TEST_F(GatherTester, TestGPUGatherTestSelectedRows) {
-  InitCtxOnGpu(true);
-  TestGatherSelectedRows();
+TEST(GatherTester, TestGPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
 }
 #endif
 }  // namespace details
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 0d7fbdfeab..e4194a7442 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -17,21 +17,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-
-// GetTensorFromVar is used in broadcast_op handle and gather_op handle, so it
-// should be placed in a commonplace. I don't find an appropriate place, so I
-// temporarily place it in op_handle_base.
-Tensor *GetTensorFromVar(Variable *in_var) {
-  if (in_var->IsType<LoDTensor>()) {
-    return in_var->GetMutable<LoDTensor>();
-  } else if (in_var->IsType<SelectedRows>()) {
-    return in_var->GetMutable<SelectedRows>()->mutable_value();
-  } else {
-    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
-  }
-  return nullptr;
-}
-
 std::string OpHandleBase::DebugString() const {
   std::stringstream ss;
   ss << "(";
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index b733817dcd..fbdb54ba8d 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -17,9 +17,6 @@
 #include <string>
 
 #include "paddle/fluid/framework/details/var_handle.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -27,11 +24,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-// GetTensorFromVar is used in broadcast_op handle and gather_op handle, so it
-// should be placed in a commonplace. I don't find an appropriate place, so I
-// temporarily place it in op_handle.
-Tensor *GetTensorFromVar(Variable *in_var);
-
 constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 
 class OpHandleBase {
From 482314e3b1a7f869daca7de302eab0b53abd91cf Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 13 Apr 2018 17:09:39 +0800
Subject: [PATCH 55/67] Add CRF unittest

---
 .../tests/book/test_label_semantic_roles.py  |  11 +-
 .../tests/unittests/test_parallel_executor.py | 145 ++++++++++++++++++
 2 files changed, 150 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index 4d8bca4d24..d9cd76952e 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -12,17 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib import math - import numpy as np +import os +import time +import unittest + import paddle import paddle.dataset.conll05 as conll05 import paddle.fluid as fluid -from paddle.fluid.initializer import init_on_cpu -import contextlib -import time -import unittest -import os word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 95845ea4de..83d22fd799 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -505,3 +505,148 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): train_loss, test_loss, atol=1e-8), "Train loss: " + str(train_loss) + "\n Test loss:" + str(test_loss)) + + +import paddle.dataset.conll05 as conll05 +import paddle.fluid as fluid + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_dict_len = len(verb_dict) +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +mix_hidden_lr = 1e-3 +embedding_name = 'emb' + + +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + size=[pred_dict_len, word_dim], + dtype='float32', + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, size=[mark_dict_len, mark_dim], dtype='float32') + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') + ]) + + return feature_out + + +class TestCRFModel(unittest.TestCase): + def test_all(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', 
shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=1e-1)) + avg_cost = fluid.layers.mean(crf_cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=16) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + + pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, + mark, target + ], + place=fluid.CPUPlace()) + + data = train_data() + for i in xrange(10): + cur_batch = next(data) + print map(numpy.array, + pe.run(feed_dict=feeder.feed(cur_batch), + fetch_list=[avg_cost.name]))[0] From 253441b55355303f6bc5814f41806d1ab0420b0d Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 13 Apr 2018 17:36:46 +0800 Subject: [PATCH 56/67] fix duplicate lr op after distribute transpiler --- python/paddle/fluid/distribute_transpiler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index b0522b49f4..aa15392d7e 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -1115,4 +1115,6 @@ class DistributeTranspiler: for op2 in find_ops: if ufind.is_connected(op1, op2): lr_ops.append(op1) + # we only need to append op for once + break return lr_ops From 4452ff76b79b3a9acdcd15ba6e751117889db3fb Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 13 Apr 2018 17:38:43 +0800 Subject: [PATCH 57/67] Fix CPU compile --- paddle/fluid/framework/details/op_handle_base.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 28f1e7b508..534d77860f 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -109,18 +109,18 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { +#ifdef PADDLE_WITH_CUDA if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { -#ifdef PADDLE_WITH_CUDA auto *ctx = dev_ctxes_.at(p); auto *cuda_ctx = static_cast(ctx); cuda_ctx->RecordEvent(events_.at(boost::get(p).device), callback); + } #else - PADDLE_THROW("Not implemented"); + callback(); #endif - } } } // namespace details From 384d6ee8ac3e0ca9372ef90a1626f7129c9e7f37 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 13 Apr 2018 17:44:58 +0800 Subject: [PATCH 58/67] follow comments --- .../framework/details/broadcast_op_handle.cc | 37 +++---------------- .../framework/details/gather_op_handle.cc | 9 ++--- paddle/fluid/framework/tensor_util.cc | 6 +-- 3 files changed, 12 
insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 24115cae81..7d29012380 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -61,33 +61,24 @@ void BroadcastOpHandle::RunImpl() { // Wait input done, this Wait is asynchronous operation auto &in_place = in_var_handle[0]->place_; if (in_var_handle[0]->generated_op_) { - in_var_handle[0]->generated_op_->Wait(dev_ctxes_[in_place]); for (auto *out : out_var_handles) { auto &out_p = out->place_; - if (platform::is_same_place(in_place, out_p)) continue; in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]); } } // auto in_scope_idx = in_var_handle[0]->scope_idx_; - PADDLE_ENFORCE_LT(in_scope_idx, local_scopes_.size(), - "The input(%s) is not in the local_scopes.", - in_var_handle[0]->name_); - auto in_var = local_scopes_[in_scope_idx]->FindVar(in_var_handle[0]->name_); + auto in_var = + local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_); Tensor *in_tensor = GetTensorFromVar(in_var); for (auto *out : out_var_handles) { auto &out_p = out->place_; + auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_); - auto out_scope_idx = out->scope_idx_; - PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(), - "%s is not in the local_scopes ", out->name_); - - auto *s = local_scopes_[out_scope_idx]; - auto out_var = s->FindVar(out->name_); PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(), - "The place of input and output should be the same."); + "Places must be all on CPU or all on CUDA."); if (in_var->IsType()) { auto &in_sr = in_var->Get(); @@ -109,24 +100,8 @@ void BroadcastOpHandle::RunImpl() { } Tensor *out_tensor = GetTensorFromVar(out_var); - if (platform::is_cpu_place(in_place)) { - paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]), - out_tensor); - } else if (platform::is_gpu_place(in_place)) { -#ifdef PADDLE_WITH_CUDA - auto src_gpu_place = boost::get(in_place); - auto dst_gpu_place = boost::get(out_p); - void *dst_ptr = out_tensor->mutable_data(out_p); - void *src_ptr = in_tensor->data(); - int64_t size = in_tensor->numel() * SizeOfType(in_tensor->type()); - memory::Copy( - dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(dev_ctxes_[out_p]) - ->stream()); -#else - PADDLE_THROW("CUDAPlace is not supported in CPU device."); -#endif - } + paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]), + out_tensor); } } diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 3c3054c03d..8dd85be567 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -74,13 +74,10 @@ void GatherOpHandle::RunImpl() { auto in_handle = static_cast(in); auto in_p = in_handle->place_; in_places.push_back(in_p); - PADDLE_ENFORCE_LT(in_handle->scope_idx_, local_scopes_.size(), - "%s is not the the local_scopes ", in_handle->name_); PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), - "The place of input should be the same."); - auto *s = local_scopes_[in_handle->scope_idx_]; - auto in_var = s->FindVar(in_handle->name_); - + "Places must be all on CPU or all on CUDA."); + auto in_var = + local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_); auto &in_sr = in_var->Get(); PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(), diff --git 
a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1d864af011..d1b01ae05b 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -11,8 +11,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/framework/tensor_util.h"
+#include <algorithm>
+#include <limits>
+#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -65,8 +67,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
     memory::Copy(
         dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
From 47609ab2b8c5e620c2d9cbe367136d542715b782 Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Fri, 13 Apr 2018 16:11:24 -0700
Subject: [PATCH 59/67] Document transform.h and fix cpplint errors (#9913)

---
 ..._cast.h => cuda_transform_iterator_cast.h} | 33 ++++++++++++-----
 paddle/fluid/platform/transform.h             | 35 ++++++++++++++-----
 2 files changed, 50 insertions(+), 18 deletions(-)
 rename paddle/fluid/platform/details/{device_ptr_cast.h => cuda_transform_iterator_cast.h} (50%)

diff --git a/paddle/fluid/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
similarity index 50%
rename from paddle/fluid/platform/details/device_ptr_cast.h
rename to paddle/fluid/platform/details/cuda_transform_iterator_cast.h
index 1c502a19c0..06afc44c25 100644
--- a/paddle/fluid/platform/details/device_ptr_cast.h
+++ b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
@@ -18,16 +18,22 @@ limitations under the License. */
 #error device_ptr_cast must be include by .cu file
 #endif
 
-#include <type_traits>
+#include <type_traits>  // For std::remove_pointer and std::is_pointer.
+
+#include "thrust/device_ptr.h"
 
 namespace paddle {
 namespace platform {
 namespace details {
+
+// PointerToThrustDevicePtr has two specializations, one casts a (CUDA
+// device) pointer into thrust::device_ptr, the other keeps rest types
+// un-casted.
 template <typename T, bool is_ptr>
-struct DevicePtrCast;
+struct PointerToThrustDevicePtr;
 
 template <typename T>
-struct DevicePtrCast<T, true> {
+struct PointerToThrustDevicePtr<T, true> {
   using ELEM = typename std::remove_pointer<T>::type;
   using RTYPE = thrust::device_ptr<ELEM>;
 
@@ -37,17 +43,26 @@ struct DevicePtrCast<T, true> {
 };
 
 template <typename T>
-struct DevicePtrCast<T, false> {
+struct PointerToThrustDevicePtr<T, false> {
   using RTYPE = T;
   inline RTYPE operator()(RTYPE it) const { return it; }
 };
 
-// Cast T to thrust::device_ptr if T is a pointer.
-// Otherwise, e.g., T is a iterator, return T itself.
+// CastToCUDATransformIterator casts a pointer to thrust::device_ptr
+// so it could be used as the iterator of thrust::transform.  It
+// doesn't cast other types.
+//
+// We need CastToCUDATransformIterator because it is often that we
+// want to use device memory pointers as transform iterators, e.g., to
+// transform a block of float32 to float16.  In this case, we want
+// CastToCUDATransformIterator to cast float16/32 pointers to
+// thrust::device_ptr, otherwise they cannot work as the iterator
+// required by thrust::transform.  At the same time, we don't want to
+// cast thrust::device_ptr to thrust::device_ptr repeatedly.
 template <typename T>
-auto DevPtrCast(T t) ->
-    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
-  DevicePtrCast<T, std::is_pointer<T>::value> cast;
+auto CastToCUDATransformIterator(T t) ->
+    typename PointerToThrustDevicePtr<T, std::is_pointer<T>::value>::RTYPE {
+  PointerToThrustDevicePtr<T, std::is_pointer<T>::value> cast;
   return cast(t);
 }
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
index 917c48b47f..7877d3e41c 100644
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -14,29 +14,44 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <type_traits>
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/place.h"
-#include <algorithm>
-#include <type_traits>
 
 #ifdef __NVCC__
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
-#include "paddle/fluid/platform/details/device_ptr_cast.h"
+#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
#endif
 
 namespace paddle {
 namespace platform {
 
-// Transform on host or device. It provides the same API in std library.
+// Transform applies a unary or a binary functor on each element in a
+// range defined by a pair of iterators.
+//
+// - The specialization for CPU calls std::transform.
+// - The specialization for CUDA calls thrust::transform.
+//
+// NOTE: We need InputIter and OutputIter defined as different types,
+// because the InputIter points to op's inputs and OutputIter points
+// to op's outputs.
+//
+// NOTE: We don't assume that InputIter is const InputType* and
+// OutputIter is OutputType*, because we might use an iterator class,
+// e.g., paddle::fluid::operators::RowwiseTransformIterator.
 template <typename DeviceContext>
 struct Transform {
+  // The unary version.
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
   void operator()(const DeviceContext& context, InputIter first,
                   InputIter last, OutputIter result, UnaryOperation op);
 
+  // The binary version.
template void operator()(const DeviceContext& context, InputIter1 first1, @@ -70,8 +85,9 @@ struct Transform { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); thrust::transform(thrust::cuda::par.on(context.stream()), - details::DevPtrCast(first), details::DevPtrCast(last), - details::DevPtrCast(result), op); + details::CastToCUDATransformIterator(first), + details::CastToCUDATransformIterator(last), + details::CastToCUDATransformIterator(result), op); } template { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); thrust::transform(thrust::cuda::par.on(context.stream()), - details::DevPtrCast(first1), details::DevPtrCast(last1), - details::DevPtrCast(first2), details::DevPtrCast(result), - op); + details::CastToCUDATransformIterator(first1), + details::CastToCUDATransformIterator(last1), + details::CastToCUDATransformIterator(first2), + details::CastToCUDATransformIterator(result), op); } }; #endif From 92913027fc17a1240a97aa565ec7e953d4181a78 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Fri, 13 Apr 2018 16:15:02 -0700 Subject: [PATCH 60/67] fix unused var error (#9908) --- paddle/fluid/operators/math/math_function.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index c28047e6e9..9badf26c9b 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -268,6 +268,7 @@ void batched_gemm( const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float16 alpha, const float16* A, const float16* B, const float16 beta, float16* C, const int batchCount, const int strideA, const int strideB) { +#if CUDA_VERSION >= 8000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -289,7 +290,6 @@ void batched_gemm( PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, "cublas Hgemm requires GPU compute capability >= 53"); -#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); @@ -304,6 +304,7 @@ void batched_gemm( const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { +#if CUDA_VERSION >= 8000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -315,7 +316,6 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; -#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); @@ -330,6 +330,7 @@ void batched_gemm( const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { +#if CUDA_VERSION >= 8000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? 
K : M; @@ -341,7 +342,6 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; -#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); From f22da580fa7bba561ccbfd3ea55fcfbb264bf12a Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Sat, 14 Apr 2018 07:23:14 +0800 Subject: [PATCH 61/67] fix compiler error in paddle:latest-dev image (#9907) --- python/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7cbd7f22bf..c7c0812fe2 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,5 +1,5 @@ file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) -file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py) +file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py) set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES}) @@ -7,7 +7,7 @@ set(PY_FILES paddle/__init__.py if(NOT WITH_FLUID_ONLY) file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) - file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py) + file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py) set(PY_FILES ${PY_FILES} ${TRAINER_PY_FILES} ${HELPERS_PY_FILES} @@ -55,7 +55,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/pad add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND touch stub.cc - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle + COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp From 7b86da71954ecac5f9cfd28804177f78c5022eab Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Fri, 13 Apr 2018 16:24:09 -0700 Subject: [PATCH 62/67] Fix CPPLint errors in operators (#9826) * Fix CPPLint errors in operators * Fix cast in softmax * Fix softmax_mkldnn * Fix send_recv_op_test * Send_recv * Fix softmax mkldnn --- paddle/fluid/operators/scale_op.cc | 1 - paddle/fluid/operators/scatter_op.cu | 5 +- paddle/fluid/operators/scatter_op.h | 4 +- paddle/fluid/operators/scatter_test.cc | 46 ++++++++++--------- paddle/fluid/operators/send_barrier_op.cc | 2 +- paddle/fluid/operators/send_op.cc | 2 +- paddle/fluid/operators/send_recv_util.h | 3 ++ paddle/fluid/operators/sequence_concat_op.h | 1 + paddle/fluid/operators/sequence_conv_op.h | 1 + paddle/fluid/operators/sequence_erase_op.cc | 1 + paddle/fluid/operators/sequence_erase_op.h | 1 + paddle/fluid/operators/sequence_pool_op.cc | 1 + paddle/fluid/operators/sequence_pool_op.h | 1 + paddle/fluid/operators/sequence_softmax_op.cc | 1 + paddle/fluid/operators/softmax_mkldnn_op.cc | 9 ++-- paddle/fluid/operators/split_op.h | 2 +- 16 files changed, 48 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 7ca7639fdb..1e938638c9 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
 */
 
 #include "paddle/fluid/operators/scale_op.h"
-
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
index ef7d700659..a70b909172 100644
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gather.cu.h"
+#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "scatter.cu.h"
+#include "paddle/fluid/operators/scatter.cu.h"
+#include "paddle/fluid/operators/scatter_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index 2151d8a924..d29947b55e 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "gather.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "scatter.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc
index b67af3c371..750245153a 100644
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
@@ -13,44 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
 #include <gtest/gtest.h>
 #include <iostream>
 #include <string>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 TEST(scatter, ScatterUpdate) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators;
+  // using namespace paddle::framework;
+  // using namespace paddle::platform;
+  // using namespace paddle::operators;
 
-  Tensor* src = new Tensor();
-  Tensor* index = new Tensor();
-  Tensor* output = new Tensor();
+  paddle::framework::Tensor* src = new paddle::framework::Tensor();
+  paddle::framework::Tensor* index = new paddle::framework::Tensor();
+  paddle::framework::Tensor* output = new paddle::framework::Tensor();
 
   float* p_src = nullptr;
   int* p_index = nullptr;
-  p_src = src->mutable_data<float>(make_ddim({1, 4}), CPUPlace());
-  p_index = index->mutable_data<int>(make_ddim({1}), CPUPlace());
+  p_src = src->mutable_data<float>(paddle::framework::make_ddim({1, 4}),
+                                   paddle::platform::CPUPlace());
+  p_index = index->mutable_data<int>(paddle::framework::make_ddim({1}),
+                                     paddle::platform::CPUPlace());
 
-  for (size_t i = 0; i < 4; ++i) p_src[i] = float(i);
+  for (size_t i = 0; i < 4; ++i) p_src[i] = static_cast<float>(i);
   p_index[0] = 1;
 
-  float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
+  float* p_output = output->mutable_data<float>(
+      paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace());
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  ScatterAssign<float>(ctx, *src, *index, output);
+  paddle::operators::ScatterAssign<float>(ctx, *src, *index, output);
 
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
-  for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4));
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
+  for (size_t i = 4; i < 8; ++i) {
+    EXPECT_EQ(p_output[i], static_cast<float>(i - 4));
+  }
   for (size_t i = 4; i < 8; ++i)
-    EXPECT_EQ(output->data<float>()[i], float(i - 4));
+    EXPECT_EQ(output->data<float>()[i], static_cast<float>(i - 4));
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0));
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f);
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
 
   delete src;
   delete index;
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 8d02a6f291..12b844daaa 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -19,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index d47f66de21..82ff087d0a 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h
index 196f56f634..113513eb6b 100644
--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+#include <string>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h
index 9f04c41991..71c9f45287 100644
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
diff --git a/paddle/fluid/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h
index ee48339c52..b59504bb98 100644
--- a/paddle/fluid/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/context_project.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc index 32b9d7f7c1..73c0e89512 100644 --- a/paddle/fluid/operators/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_erase_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_erase_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_erase_op.h b/paddle/fluid/operators/sequence_erase_op.h index b490c34f54..265390528a 100644 --- a/paddle/fluid/operators/sequence_erase_op.h +++ b/paddle/fluid/operators/sequence_erase_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc index 3d4d54a3a3..933c8c2623 100644 --- a/paddle/fluid/operators/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_pool_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h index c58d677c92..2aa20792f2 100644 --- a/paddle/fluid/operators/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_pool_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc index e8b4df0428..d2c1317bef 100644 --- a/paddle/fluid/operators/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_softmax_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_softmax_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index cf0244e866..dc2f176344 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/
 
+#include <iostream>
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/softmax_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
-#include <iostream>
-
 namespace paddle {
 namespace operators {
 
@@ -63,9 +62,11 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
                                              softmax_md, 1 /*dim: C*/);
     // create memory primitives
     auto softmax_src_memory =
-        memory({softmax_md, mkldnn_engine}, (void*)input_data);
+        memory({softmax_md, mkldnn_engine},
+               static_cast<void*>(const_cast<T*>(input_data)));
     auto softmax_dst_memory =
-        memory({softmax_md, mkldnn_engine}, (void*)output_data);
+        memory({softmax_md, mkldnn_engine},
+               static_cast<void*>(const_cast<T*>(output_data)));
     auto softmax_prim_desc =
         softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
     auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index ae8562c0c5..e2c41f44ab 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include <chrono>
+#include <chrono>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
From d4024a6ebd8d25a287bf4671e06ea8fa781b85fd Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 13 Apr 2018 20:14:42 -0700
Subject: [PATCH 63/67] Polish memory_optimizer code with more comments and
 less indentation

---
 .../fluid/memory_optimization_transpiler.py  | 135 ++++++++++++------
 1 file changed, 88 insertions(+), 47 deletions(-)

diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py
index 41d1eca82e..20ed191042 100644
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/memory_optimization_transpiler.py
@@ -29,17 +29,20 @@ dtype_to_size = {
     core.VarDesc.VarType.BOOL: 1
 }
 
-sub_block_ops = [
+SUB_BLOCK_OPS = [
     "while", "while_grad", "parallel_do", "parallel_do_grad",
     "conditional_block", "conditional_block_grad"
 ]
 
+SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
+                  ("conditional_block", "conditional_block_grad")]
+
 PRINT_LOG = False
 
 
 class ControlFlowGraph(object):
-    def __init__(self, Program, ops, forward_num, skip_opt):
-        self._program = Program
+    def __init__(self, program, ops, forward_num, skip_opt):
+        self._program = program
         self._ops = ops
         self._forward_num = forward_num
         self._successors = defaultdict(set)
@@ -51,6 +54,7 @@ class ControlFlowGraph(object):
         self._skip_opt = skip_opt
 
     def _add_connections(self, connections):
+        """Populates _successors and _presuccessors for two neighbor nodes."""
         for node1, node2 in connections:
             self._add(node1, node2)
 
@@ -58,7 +62,11 @@ class ControlFlowGraph(object):
         self._successors[node1].add(node2)
         self._presuccessors[node2].add(node1)
 
+    # TODO(panyx0718): We need to have a unified way of building intermediate
+    # representation.
     def _build_graph(self):
+        """Build a graph based on op sequence.
+ """ self.op_size = len(self._ops) op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)] self._add_connections(op_node_connections) @@ -82,15 +90,14 @@ class ControlFlowGraph(object): self._live_out[i].add(new_name) def _reach_fixed_point(self, live_in, live_out): + """Check if the liveness set has stablized.""" if len(live_in) != len(self._live_in): return False if len(live_out) != len(self._live_out): return False for i in range(self.op_size): - if live_in[i] != self._live_in[i]: - return False - for i in range(self.op_size): - if live_out[i] != self._live_out[i]: + if (live_in[i] != self._live_in[i] or + live_out[i] != self._live_out[i]): return False return True @@ -98,6 +105,8 @@ class ControlFlowGraph(object): self._build_graph() live_in = defaultdict(set) live_out = defaultdict(set) + # Repeatedly apply liveness updates until the algorithm stablize + # on a complete set live input vars and live output vars. while True: for i in range(self.op_size, 0, -1): live_in[i] = set(self._live_in[i]) @@ -141,6 +150,8 @@ class ControlFlowGraph(object): return False return True + # TODO(panyx0718): This needs to be less hacky. It seems memory optimization + # doesn't consider vars copied between cpu and gpu. def _update_skip_opt_set(self): for i in range(self.op_size): op = self._ops[i] @@ -154,7 +165,7 @@ class ControlFlowGraph(object): bwd_id = 0 for i in range(self.op_size): op = self._ops[i] - if op.type() in sub_block_ops: + if op.type() in SUB_BLOCK_OPS: continue block_desc = op.block() is_forward = i < self._forward_num @@ -177,13 +188,15 @@ class ControlFlowGraph(object): def compare_shape(x_shape, cache_shape, opt_level): if opt_level == 0: return x_shape == cache_shape - if opt_level == 1: + elif opt_level == 1: if (x_shape[0] == -1) ^ (cache_shape[0] == -1): return False x_size = abs(reduce(lambda x, y: x * y, x_shape)) cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) if x_size <= cache_size: return True + else: + raise ValueError("only support opt_level 0 or 1.") return False self._dataflow_analyze() @@ -191,10 +204,9 @@ class ControlFlowGraph(object): self.pool = [] for i in range(self.op_size): op = self._ops[i] - if op.type() in sub_block_ops: + if op.type() in SUB_BLOCK_OPS: continue block_desc = op.block() - self.current_block_desc = block_desc is_forward = i < self._forward_num if self.pool: defs_can_optimize = filter( @@ -211,37 +223,40 @@ class ControlFlowGraph(object): for index, cache_pair in enumerate(self.pool): cache_var = cache_pair[0] cache_shape = cache_pair[1] - if compare_shape(x_shape, cache_shape, level): - if self._has_var(block_desc, cache_var, is_forward): - x_dtype = self._find_var(block_desc, x, - is_forward).dtype() - cache_dtype = self._find_var( - block_desc, cache_var, is_forward).dtype() - # TODO(qijun): actually, we should compare dtype_to_size[x_dtype] - # and dtype_to_size[cache_dtype] - if x_dtype == cache_dtype: - if PRINT_LOG: - print( - ("Hit Cache !!!! 
cache pool index " - "is %d, var name is %s, " - "cached var name is %s, " - "var shape is %s ") % - (index, x, cache_var, - str(cache_shape))) - self.pool.pop(index) - if x == cache_var: - break - _rename_arg_( - self._ops, x, cache_var, begin_idx=i) - self._program.block(block_desc.id).var( - str(x)).desc = self._find_var( - block_desc, cache_var, is_forward) - self._update_graph( - x, cache_var, begin_idx=i) - break - - in_diff, out_diff = self._get_diff(self._live_in[i], - self._live_out[i]) + if not compare_shape(x_shape, cache_shape, level): + continue + + if not self._has_var(block_desc, cache_var, is_forward): + continue + + x_dtype = self._find_var(block_desc, x, + is_forward).dtype() + cache_dtype = self._find_var(block_desc, cache_var, + is_forward).dtype() + # TODO(qijun): actually, we should compare + # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype] + if x_dtype != cache_dtype: + continue + + if PRINT_LOG: + print(("Hit Cache !!!! cache pool index " + "is %d, var name is %s, " + "cached var name is %s, " + "var shape is %s ") % (index, x, cache_var, + str(cache_shape))) + self.pool.pop(index) + if x == cache_var: + break + # Rename the var to the cache var already with + # memory allocated in order to reuse the memory. + _rename_arg_(self._ops, x, cache_var, begin_idx=i) + self._program.block(block_desc.id).var(str( + x)).desc = self._find_var(block_desc, cache_var, + is_forward) + self._update_graph(x, cache_var, begin_idx=i) + break + + in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) can_optimize = filter( lambda x: self._check_var_validity(block_desc, x, is_forward), in_diff) @@ -252,6 +267,19 @@ class ControlFlowGraph(object): def _process_sub_block_pair(pdesc, sub_block_pair): + """Creates a list of tuple each of which tracks info of a subblock. + + Note: this function doesn't handle nested subblocks yet. + TODO(panyx0718): assert if case nested subblocks happen. + + :param pdesc: ProgramDesc. + :param sub_block_pair: A list op pairs. Each op pair is the forward + op and backward op. The ops in the list are special that they contain + a subblock of ops. + :return: A list of tuples, each tuple is (all ops in a subblock pair + including forward and backward, number of forward ops, + all output args names of the ops in the subblock pairs). + """ ops_list = [] block_desc = pdesc.block(0) op_size = block_desc.op_size() @@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair): def _get_cfgs(input_program): + """Process each block and create ControlFlowGraph for each of them. + + :param input_program: Program object. + :return: A list of ControlFlowGraph, each corresponds to a block. + """ ops_list = [] pdesc = input_program.get_desc() block_desc = pdesc.block(0) @@ -316,11 +349,8 @@ def _get_cfgs(input_program): ops_list.append( ([block_desc.op(i) for i in range(op_size)], op_size, set())) - sub_block_pair = [("while", "while_grad"), ("parallel_do", - "parallel_do_grad"), - ("conditional_block", "conditional_block_grad")] - - ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair)) + # Only process one level of nested subblock. + ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR)) cfgs = [ ControlFlowGraph(input_program, ops, forward_num, skip_opt) @@ -330,6 +360,17 @@ def _get_cfgs(input_program): def memory_optimize(input_program, print_log=False, level=0): + """Optimize memory by reusing var memory. + + Note: it doesn't not support subblock nested in subblock. 
+
+    :param input_program: Input Program
+    :param print_log: whether to print debug log.
+    :param level: If level=0, reuse a var only when the shapes are exactly
+    equal; if level=1, also reuse when the cached var is at least as large.
+    :return:
+    """
+    if level != 0 and level != 1:
+        raise ValueError("only support opt_level 0 or 1.")
     global PRINT_LOG
     PRINT_LOG = print_log
     cfgs = _get_cfgs(input_program)
From b48cf1712bebe617764539064aadf31f7ecc2b1d Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Fri, 13 Apr 2018 22:33:25 -0700
Subject: [PATCH 64/67] Fix cpplint errors in transform_test.cu (#9915)

* Fix cpplint errors with transformer_test.cu

* Update
---
 paddle/fluid/platform/transform_test.cu | 31 +++++++++++++++----------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
index 7b5cfd8f43..f65d1f6010 100644
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -18,11 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/transform.h"
 
+namespace {
+
 template <typename T>
 class Scale {
  public:
   explicit Scale(const T& scale) : scale_(scale) {}
-
   HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
 
  private:
@@ -35,11 +36,23 @@ class Multiply {
   HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
 };
 
+}  // namespace
+
+using paddle::memory::Alloc;
+using paddle::memory::Free;
+using paddle::memory::Copy;
+
+using paddle::platform::CPUPlace;
+using paddle::platform::CUDAPlace;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::CUDADeviceContext;
+
+using paddle::platform::Transform;
+
 TEST(Transform, CPUUnary) {
-  using namespace paddle::platform;
   CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform<paddle::platform::CPUDeviceContext> trans;
+  Transform<CPUDeviceContext> trans;
   trans(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
   }
 }
 
 TEST(Transform, GPUUnary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
-  Transform<paddle::platform::CUDADeviceContext> trans;
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
@@ -65,10 +76,8 @@ TEST(Transform, GPUUnary) {
 }
 
 TEST(Transform, CPUBinary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform<paddle::platform::CPUDeviceContext> trans;
+  Transform<CPUDeviceContext> trans;
   CPUDeviceContext ctx;
   trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
@@ -77,14 +86,12 @@ TEST(Transform, CPUBinary) {
 }
 
 TEST(Transform, GPUBinary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
-  Transform<paddle::platform::CUDADeviceContext> trans;
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
From 630943c7a79ce2ee3c3ce291a3bb8c5a32b8931d Mon Sep 17 00:00:00 2001
From: Yi Wang
Date: Fri, 13 Apr 2018 23:42:02 -0700
Subject: [PATCH 65/67] Update documentation (#9918)

---
 paddle/fluid/platform/cuda_profiler.h | 9
++++---- paddle/fluid/platform/variant.h | 32 ++++++++++++--------------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h index ebd6aebd76..41d7c12146 100644 --- a/paddle/fluid/platform/cuda_profiler.h +++ b/paddle/fluid/platform/cuda_profiler.h @@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once + #include -#include -#include -#include + +#include + +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index 05ca33137d..45f60fc9d7 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -14,29 +14,25 @@ limitations under the License. */ #pragma once -#ifdef __CUDACC__ -#ifdef __CUDACC_VER_MAJOR__ -// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define -// __CUDACC_VER__ instead. +// Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__ +// is removed, so we have to manually define __CUDACC_VER__ instead. +// For details, please refer to +// https://github.com/PaddlePaddle/Paddle/issues/6626 +#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__) #undef __CUDACC_VER__ - -#define __CUDACC_VER__ \ - (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \ - __CUDACC_VER_BUILD__) -#endif - +#define __CUDACC_VER__ \ + __CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \ + __CUDACC_VER_MINOR__ * 100 #endif -#include +#include "boost/config.hpp" -#ifdef PADDLE_WITH_CUDA - -// Because boost's variadic templates has bug on nvcc, boost will disable -// variadic template support when GPU enabled on nvcc. -// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same -// function symbols. -// +// Because Boost 1.41.0's variadic templates has bug on nvcc, boost +// will disable variadic template support in NVCC mode. Define +// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same +// function symbols. For details, // https://github.com/PaddlePaddle/Paddle/issues/3386 +#ifdef PADDLE_WITH_CUDA #ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES #define BOOST_NO_CXX11_VARIADIC_TEMPLATES #endif From 494c262a26a1ff29143491fa60fd6ba546d3bebf Mon Sep 17 00:00:00 2001 From: whs Date: Sat, 14 Apr 2018 14:42:58 +0800 Subject: [PATCH 66/67] Fix average_accumulate_op for parallel executor. 
(#9852) --- .../fluid/operators/average_accumulates_op.cu | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 046f72b471..104e24f6ee 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -25,12 +25,14 @@ void GetAccumulators( auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); - memory::Copy(platform::CPUPlace(), old_num_accumulates_, - platform::CUDAPlace(), in_old_num_accumulates->data(), - sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(), + auto cuda_place = + boost::get(in_old_num_accumulates->place()); + memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, + in_old_num_accumulates->data(), sizeof(int64_t), + stream); + memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place, in_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(), + memory::Copy(platform::CPUPlace(), num_updates_, cuda_place, in_num_updates->data(), sizeof(int64_t), stream); } @@ -42,14 +44,16 @@ void SetAccumulators( auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); + auto cuda_place = + boost::get(out_old_num_accumulates->place()); - memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data(), + memory::Copy(cuda_place, out_old_num_accumulates->data(), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), stream); - memory::Copy(platform::CUDAPlace(), out_num_accumulates->data(), + memory::Copy(cuda_place, out_num_accumulates->data(), platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), stream); - memory::Copy(platform::CUDAPlace(), out_num_updates->data(), + memory::Copy(cuda_place, out_num_updates->data(), platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream); } From 04a652c6365145d8e6f2f879369e1a92f0ce3d36 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 16 Apr 2018 10:07:25 +0800 Subject: [PATCH 67/67] specified pip version in dev image --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7856d3bbc4..0f13acabc3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,7 +57,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip install --upgrade pip && \ +RUN pip install --upgrade pip==9.0.3 && \ pip install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark
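
For readers following the memory-optimization work in patch 63 above, here is a minimal usage sketch of the `memory_optimize` transpiler. It is only an illustration, not part of the patches: it assumes the Fluid API of this era, and the toy network and the names `main`, `startup`, and `avg_cost` are invented for the example.

    # A minimal sketch, assuming paddle.fluid circa this patch series.
    import paddle.fluid as fluid

    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        # A toy regression network, just to have something to optimize.
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        y_predict = fluid.layers.fc(input=x, size=1)
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(cost)
        fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)

    # level=0 reuses a var only when shapes match exactly; level=1 also
    # reuses when the cached var is at least as large (see compare_shape
    # in the patch above).
    fluid.memory_optimize(main, print_log=False, level=0)

The call rewrites the var descs of `main` in place, so it should run only after the whole program, including the backward and optimizer ops, has been built.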