From 88a8eedda17dead5471f4d9a64e291e49b522775 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Thu, 28 Sep 2017 14:36:38 -0700
Subject: [PATCH 01/37] Add GPU kernels for gather and scatter ops

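Add CUDA implementations of the gather and scatter operators
(gather.cu.h, scatter.cu.h, gather_op.cu, scatter_op.cu) and split the
tensor wrappers into CPU-only and GPU-only variants. Scatter now assigns
(ScatterAssign) instead of accumulating (ScatterUpdate); the Python
scatter test is updated to match.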
---
 paddle/operators/cond_op.cc                   |  6 +-
 paddle/operators/gather.cu.h                  | 84 ++++++++++++++++++
 paddle/operators/gather.h                     | 35 ++++----
 paddle/operators/gather_op.cc                 |  9 +-
 paddle/operators/gather_op.cu                 | 70 +++++++++++++++
 paddle/operators/gather_op.h                  | 27 ++++--
 paddle/operators/gather_test.cc               |  2 +-
 paddle/operators/scatter.cu.h                 | 86 +++++++++++++++++++
 paddle/operators/scatter.h                    | 45 ++++------
 paddle/operators/scatter_op.cc                |  7 +-
 paddle/operators/scatter_op.cu                | 63 ++++++++++++++
 paddle/operators/scatter_op.h                 | 12 ++-
 paddle/operators/scatter_test.cc              |  2 +-
 .../v2/framework/tests/test_scatter_op.py     |  4 +-
 14 files changed, 375 insertions(+), 77 deletions(-)
 create mode 100644 paddle/operators/gather.cu.h
 create mode 100644 paddle/operators/gather_op.cu
 create mode 100644 paddle/operators/scatter.cu.h
 create mode 100644 paddle/operators/scatter_op.cu
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index aaffa6661f..157656786a 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -169,8 +169,8 @@ void CondOp::Run(const Scope& scope,
       tensor_child->Resize(dim);
       tensor_child->mutable_data<float>(dim, platform::CPUPlace());
 
-      Gather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
-                    tensor_child);
+      CPUTGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
+                        tensor_child);
     }
   }
 
@@ -194,7 +194,7 @@ void CondOp::Run(const Scope& scope,
       PADDLE_ENFORCE_NOT_NULL(v);
       LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
 
-      ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
+      ScatterAssign<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
                            tensor_parent);
     }
   }
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
new file mode 100644
index 0000000000..c96071e295
--- /dev/null
+++ b/paddle/operators/gather.cu.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::Place;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
+                                 size_t index_size, size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    int gather_i = indices[indices_i];
+    int params_i = gather_i * slice_size + slice_i;
+    *(output + i) = *(params + params_i);
+  }
+}
+
+// Implementation of GPU copy:
+template <typename T>
+struct GPUGather {
+  void operator()(const T* src, const int* index, const int slice_size,
+                  const int index_size, T* output) {
+    int block = 512;
+    int n = slice_size * index_size;
+    int grid = (n + block - 1) / block;
+    GatherCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
+                                         slice_size);
+  }
+};
+
+/**
+ * A thin wrapper on gpu tensor
+ * Return a new tensor from source tensor, gathered according to index
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
+                Tensor* output) {
+  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index->dims().size() == 1);
+  int index_size = index->dims()[0];
+
+  auto src_dims = src->dims();
+  framework::DDim output_dims(src_dims);
+  output_dims[0] = index_size;
+
+  // slice size
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  // Gathering
+  GPUGather<T> gather_functor;
+  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
+                 output->data<T>());
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
index 92fb51ec17..a3db17bd3d 100644
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -26,31 +26,31 @@ namespace operators {
 
 // Implementation of CPU copy
 template <typename T>
-void CPUGather(const T* src, const int* indices, const int slice_size,
-               const int index_size, T* output) {
-  const size_t slice_bytes = slice_size * sizeof(T);
+struct CPUGather {
+  void operator()(const T* src, const int* indices, const int slice_size,
+                  const int index_size, T* output) {
+    const size_t slice_bytes = slice_size * sizeof(T);
 
-  for (int i = 0; i < index_size; ++i) {
-    int index_ = indices[i];
-    memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
+    for (int i = 0; i < index_size; ++i) {
+      int index_ = indices[i];
+      memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
+    }
   }
-}
-
-// Implementation of GPU copy:
-template <typename T>
-void GPUGather(const T* src, const int* index, const int slice_size,
-               const int index_size, T* output);
+};
 
 /**
+ * A thin wrapper on cpu tensor
  * Return a new tensor from source tensor, gathered according to index
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
-            const paddle::framework::Tensor* index,
-            paddle::framework::Tensor* output) {
+void CPUTGather(const platform::Place& place,
+                const paddle::framework::Tensor* src,
+                const paddle::framework::Tensor* index,
+                paddle::framework::Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
@@ -64,10 +64,9 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
   // Gathering
-  if (platform::is_cpu_place(place)) {
-    CPUGather<T>(src->data<T>(), index->data<int>(), slice_size, index_size,
+  CPUGather<T> gather_functor;
+  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
                  output->data<T>());
-  }
 }
 
 }  // namespace operators
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index da22bd0c52..fe305337cb 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -31,6 +31,8 @@ class GatherOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of GatherOp should not be null.");
 
+    auto index_dims = ctx->GetInputDim("Index");
+    PADDLE_ENFORCE(index_dims.size() == 1);
     int batch_size = ctx->GetInputDim("Index")[0];
     PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx->GetInputDim("X"));
@@ -79,8 +81,5 @@ Out = X[Index]
 namespace ops = paddle::operators;
 REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
             ops::GatherGradOp);
-REGISTER_OP_CPU_KERNEL(gather,
-                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    gather_grad,
-    ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
new file mode 100644
index 0000000000..f3ed692666
--- /dev/null
+++ b/paddle/operators/gather_op.cu
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+// template <typename T>
+__global__ void print_arr(const float *params, const int N) {
+  CUDA_1D_KERNEL_LOOP(i, N) { printf("device: %d, %f\n", i, params[i]); }
+}
+
+template <typename T>
+class GatherOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+
+    output->mutable_data<T>(ctx.GetPlace());
+
+    GPUTGather<T>(ctx.GetPlace(), x, index, output);
+  }
+};
+
+template <typename T>
+class GatherGradOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    LOG(INFO) << "Gather grad here";
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *x = ctx.Input<Tensor>("X");
+
+    dX->mutable_data<T>(ctx.GetPlace());
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto place = ctx.GetEigenDevice<platform::GPUPlace>();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
+    GPUTScatter<T>(ctx.GetPlace(), dO, Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index 073e566e8f..b80a4ab370 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -23,29 +23,40 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename T>
 class GatherOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Index = ctx.Input<Tensor>("Index");
-    auto *Y = ctx.Output<Tensor>("Out");
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+
+    output->mutable_data<T>(ctx.GetPlace());
 
-    Y->mutable_data<T>(ctx.GetPlace());
-    Gather<T>(ctx.GetPlace(), X, Index, Y);
+    CPUTGather<T>(ctx.GetPlace(), x, index, output);
   }
 };
 
-template <typename Place, typename T>
+template <typename T>
 class GatherGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     dX->mutable_data<T>(ctx.GetPlace());
-    ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
+    ScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
   }
 };
 
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
index 0ae1e99452..ea06ae2847 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -41,7 +41,7 @@ TEST(Gather, GatherData) {
 
   int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
 
-  Gather<int>(CPUPlace(), src, index, output);
+  CPUTGather<int>(CPUPlace(), src, index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
new file mode 100644
index 0000000000..82e5040305
--- /dev/null
+++ b/paddle/operators/scatter.cu.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void ScatterCUDAKernel(const T* params, const int* indices,
+                                  T* output, size_t index_size,
+                                  size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    int scatter_i = indices[indices_i];
+    int out_i = scatter_i * slice_size + slice_i;
+    *(output + out_i) = *(params + i);
+  }
+}
+
+// Implementation of GPU copy:
+template <typename T>
+struct GPUScatterAssign {
+  void operator()(const T* src, const int* index, const int slice_size,
+                  const int index_size, T* output) {
+    int block = 512;
+    int n = slice_size * index_size;
+    int grid = (n + block - 1) / block;
+    // printf("grid, block: %d %d\n", grid, block);
+    ScatterCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
+                                          slice_size);
+  }
+};
+
+/**
+ * A thin wrapper on gpu tensor
+ * Return a new updated tensor from source tensor, scatter-assigned according to
+ * index
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void GPUTScatter(const platform::Place& place,
+                 const paddle::framework::Tensor* src,
+                 const paddle::framework::Tensor* index,
+                 paddle::framework::Tensor* output) {
+  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index->dims().size() == 1);
+  int index_size = index->dims()[0];
+
+  auto src_dims = src->dims();
+  framework::DDim output_dims(src_dims);
+  output_dims[0] = index_size;
+
+  // slice size
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  // Scatter Assign
+  GPUScatterAssign<T> scatter_functor;
+  scatter_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
+                  output->data<T>());
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h
index 6b542675c2..670204b4dd 100644
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -24,49 +24,33 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 // Implementation of CPU copy
 template <typename T>
-void CPUScatterUpdate(const paddle::framework::Tensor* src, const int* index,
-                      const size_t index_size,
-                      paddle::framework::Tensor* output) {
-  paddle::framework::DDim output_dims = output->dims();
+void CPUScatterAssign(const T* src, const int* index, const int slice_size,
+                      const int index_size, T* output) {
+  // paddle::framework::DDim output_dims = output->dims();
+  const size_t slice_bytes = slice_size * sizeof(T);
 
-  for (size_t i = 0; i < index_size; ++i) {
+  for (int i = 0; i < index_size; ++i) {
     int index_ = index[i];
-
-    paddle::framework::Tensor src_ = *src;
-    paddle::framework::Tensor output_ = *output;
-    if (index_size > 1) src_ = src->Slice<T>(i, i + 1);
-    if (output_dims[0] > 1) output_ = output->Slice<T>(index_, index_ + 1);
-
-    auto X = EigenVector<T>::Flatten(src_);
-    auto Y = EigenVector<T>::Flatten(output_);
-
-    Y = X + Y;
+    memcpy(output + index_ * slice_size, src + i * slice_size, slice_bytes);
   }
 }
 
-// Implementation of GPU scatter:
-template <typename T>
-void GPUScatterUpdate(const T* src, const int* index, const int slice_size,
-                      const int index_size, T* output);
-
 /**
  * Return a updated tensor from source tensor, scattered according to index:
- * dst[i] += src[index[i]]
+ * dst[i] = src[index[i]]
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void ScatterUpdate(const platform::Place& place,
+void ScatterAssign(const platform::Place& place,
                    const paddle::framework::Tensor* src,
                    const paddle::framework::Tensor* index,
                    paddle::framework::Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
@@ -74,18 +58,19 @@ void ScatterUpdate(const platform::Place& place,
   auto src_dims = src->dims();
   auto dst_dims = output->dims();
 
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
+
   // check src shape and dst shape should match
   for (int i = 1; i < src_dims.size(); i++)
     PADDLE_ENFORCE(src_dims[i] == dst_dims[i]);
 
   // slice size
   size_t slice_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  if (platform::is_cpu_place(place)) {
-    CPUScatterUpdate<T>(src, index->data<int>(), index_size, output);
-  } else {
-  }
+  CPUScatterAssign<T>(p_src, p_index, slice_size, index_size, p_output);
 }
 
 }  // namespace operators
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index cadd8841b6..d15ba15153 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -97,8 +97,5 @@ Out[Index] = Ref[Index] + Updates
 namespace ops = paddle::operators;
 REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
             ops::ScatterGradOp);
-REGISTER_OP_CPU_KERNEL(scatter,
-                       ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    scatter_grad,
-    ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
new file mode 100644
index 0000000000..e27a926c6a
--- /dev/null
+++ b/paddle/operators/scatter_op.cu
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ScatterOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *Out = ctx.Output<Tensor>("Out");
+
+    Out->ShareDataWith<T>(*Ref);
+
+    GPUTScatter<T>(ctx.GetPlace(), Updates, Index, Out);
+  }
+};
+
+template <typename T>
+class ScatterGradOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // In place gradient: dRef = dO
+    dRef->ShareDataWith<T>(*dOut);
+    dUpdates->mutable_data<T>(ctx.GetPlace());
+    // Gradient by Gather: dUpdates = dO[Index]
+    GPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index a8eb54399a..74b2718f43 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -23,10 +23,12 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename T>
 class ScatterOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
     auto *Ref = ctx.Input<Tensor>("Ref");
     auto *Index = ctx.Input<Tensor>("Index");
     auto *Updates = ctx.Input<Tensor>("Updates");
@@ -35,14 +37,16 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     // In place output: Out = Ref, Out[Index] += Updates
     Out->ShareDataWith<T>(*Ref);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out);
+    ScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
   }
 };
 
-template <typename Place, typename T>
+template <typename T>
 class ScatterGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
     auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
     auto *Index = ctx.Input<Tensor>("Index");
@@ -52,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
-    Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    CPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
   }
 };
 
diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc
index 26fdaff146..bace6419d0 100644
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
@@ -40,7 +40,7 @@ TEST(scatter, ScatterUpdate) {
 
   float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
 
-  ScatterUpdate<float>(CPUPlace(), src, index, output);
+  ScatterAssign<float>(CPUPlace(), src, index, output);
 
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
diff --git a/python/paddle/v2/framework/tests/test_scatter_op.py b/python/paddle/v2/framework/tests/test_scatter_op.py
index 33c73c5263..1032269d5d 100644
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
@@ -10,7 +10,7 @@ class TestScatterOp(OpTest):
         index_np = np.array([1, 2]).astype("int32")
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
-        output_np[index_np] += updates_np
+        output_np[index_np] = updates_np
         self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}
 
@@ -18,7 +18,7 @@ class TestScatterOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['Updates', 'Ref'], 'Out', in_place=True)
+        self.check_grad(['Updates'], 'Out', in_place=True)
 
 
 if __name__ == "__main__":

From b851515b16d179f35410836a17f855b9b6a9c268 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Thu, 28 Sep 2017 15:41:20 -0700
Subject: [PATCH 02/37] Adapt gather/scatter CUDA kernels to OpKernel<T>

---
 paddle/operators/gather_op.cu  | 9 ++-------
 paddle/operators/scatter_op.cu | 4 ++--
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
index f3ed692666..f7533cdd64 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -20,13 +20,8 @@
 namespace paddle {
 namespace operators {
 
-// template <typename T>
-__global__ void print_arr(const float *params, const int N) {
-  CUDA_1D_KERNEL_LOOP(i, N) { printf("device: %d, %f\n", i, params[i]); }
-}
-
 template <typename T>
-class GatherOpCUDAKernel : public framework::OpKernel {
+class GatherOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -42,7 +37,7 @@ class GatherOpCUDAKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class GatherGradOpCUDAKernel : public framework::OpKernel {
+class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index e27a926c6a..89d23945e0 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-class ScatterOpCUDAKernel : public framework::OpKernel {
+class ScatterOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -37,7 +37,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class ScatterGradOpCUDAKernel : public framework::OpKernel {
+class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),

From 78808b20911dd95e1a49495c99d814b59e3290c9 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Thu, 28 Sep 2017 17:27:37 -0700
Subject: [PATCH 03/37] Use a single gather/scatter API per device

---
 paddle/operators/cond_op.cc     |  4 ++--
 paddle/operators/gather.cu.h    | 30 ++++++++++----------------
 paddle/operators/gather.h       | 38 +++++++++++++--------------------
 paddle/operators/gather_op.cu   |  4 ++--
 paddle/operators/gather_op.h    |  2 +-
 paddle/operators/gather_test.cc |  2 +-
 paddle/operators/scatter.cu.h   | 36 ++++++++++++-------------------
 paddle/operators/scatter.h      | 20 ++++++-----------
 paddle/operators/scatter_op.cu  |  4 ++--
 paddle/operators/scatter_op.h   |  2 +-
 10 files changed, 55 insertions(+), 87 deletions(-)

diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index 157656786a..983b5142b1 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -169,8 +169,8 @@ void CondOp::Run(const Scope& scope,
       tensor_child->Resize(dim);
       tensor_child->mutable_data<float>(dim, platform::CPUPlace());
 
-      CPUTGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
-                        tensor_child);
+      CPUGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
+                       tensor_child);
     }
   }
 
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
index c96071e295..b400c10440 100644
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -38,19 +38,6 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
   }
 }
 
-// Implementation of GPU copy:
-template <typename T>
-struct GPUGather {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    GatherCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                         slice_size);
-  }
-};
-
 /**
  * A thin wrapper on gpu tensor
  * Return a new tensor from source tensor, gathered according to index
@@ -59,8 +46,8 @@ struct GPUGather {
  * return: output tensor
  */
 template <typename T>
-void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
-                Tensor* output) {
+void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
+               Tensor* output) {
   PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
@@ -74,10 +61,15 @@ void GPUTGather(const Place& place, const Tensor* src, const Tensor* index,
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Gathering
-  GPUGather<T> gather_functor;
-  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-                 output->data<T>());
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
+
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+  GatherCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                       slice_size);
 }
 
 }  // namespace operators
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
index a3db17bd3d..cb635f6825 100644
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -24,32 +24,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-// Implementation of CPU copy
-template <typename T>
-struct CPUGather {
-  void operator()(const T* src, const int* indices, const int slice_size,
-                  const int index_size, T* output) {
-    const size_t slice_bytes = slice_size * sizeof(T);
-
-    for (int i = 0; i < index_size; ++i) {
-      int index_ = indices[i];
-      memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
-    }
-  }
-};
-
 /**
- * A thin wrapper on cpu tensor
+ * A thin wrapper for gathering on cpu tensor
  * Return a new tensor from source tensor, gathered according to index
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void CPUTGather(const platform::Place& place,
-                const paddle::framework::Tensor* src,
-                const paddle::framework::Tensor* index,
-                paddle::framework::Tensor* output) {
+void CPUGather(const platform::Place& place,
+               const paddle::framework::Tensor* src,
+               const paddle::framework::Tensor* index,
+               paddle::framework::Tensor* output) {
   PADDLE_ENFORCE(platform::is_cpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
@@ -59,14 +45,20 @@ void CPUTGather(const platform::Place& place,
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
+
   // slice size
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Gathering
-  CPUGather<T> gather_functor;
-  gather_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-                 output->data<T>());
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
+  }
 }
 
 }  // namespace operators
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
index f7533cdd64..06004614b2 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    GPUTGather<T>(ctx.GetPlace(), x, index, output);
+    GPUGather<T>(ctx.GetPlace(), x, index, output);
   }
 };
 
@@ -53,7 +53,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto place = ctx.GetEigenDevice<platform::GPUPlace>();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
-    GPUTScatter<T>(ctx.GetPlace(), dO, Index, dX);
+    GPUScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
   }
 };
 
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index b80a4ab370..fb065b8da7 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -36,7 +36,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    CPUTGather<T>(ctx.GetPlace(), x, index, output);
+    CPUGather<T>(ctx.GetPlace(), x, index, output);
   }
 };
 
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
index ea06ae2847..3c1d06ccd1 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -41,7 +41,7 @@ TEST(Gather, GatherData) {
 
   int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
 
-  CPUTGather<int>(CPUPlace(), src, index, output);
+  CPUGather<int>(CPUPlace(), src, index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
index 82e5040305..add4791a79 100644
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
@@ -36,20 +36,6 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
   }
 }
 
-// Implementation of GPU copy:
-template <typename T>
-struct GPUScatterAssign {
-  void operator()(const T* src, const int* index, const int slice_size,
-                  const int index_size, T* output) {
-    int block = 512;
-    int n = slice_size * index_size;
-    int grid = (n + block - 1) / block;
-    // printf("grid, block: %d %d\n", grid, block);
-    ScatterCUDAKernel<T><<<grid, block>>>(src, index, output, index_size,
-                                          slice_size);
-  }
-};
-
 /**
  * A thin wrapper on gpu tensor
  * Return a new updated tensor from source tensor, scatter-assigned according to
@@ -59,10 +45,10 @@ struct GPUScatterAssign {
  * return: output tensor
  */
 template <typename T>
-void GPUTScatter(const platform::Place& place,
-                 const paddle::framework::Tensor* src,
-                 const paddle::framework::Tensor* index,
-                 paddle::framework::Tensor* output) {
+void GPUScatterAssign(const platform::Place& place,
+                      const paddle::framework::Tensor* src,
+                      const paddle::framework::Tensor* index,
+                      paddle::framework::Tensor* output) {
   PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
@@ -76,10 +62,16 @@ void GPUTScatter(const platform::Place& place,
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Scatter Assign
-  GPUScatterAssign<T> scatter_functor;
-  scatter_functor(src->data<T>(), index->data<int>(), slice_size, index_size,
-                  output->data<T>());
+  const T* p_src = src->data<T>();
+  const int* p_index = index->data<int>();
+  T* p_output = output->data<T>();
+
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+
+  ScatterCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
+                                        slice_size);
 }
 
 }  // namespace operators
diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h
index 670204b4dd..f895f22e28 100644
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -25,19 +25,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-// Implementation of CPU copy
-template <typename T>
-void CPUScatterAssign(const T* src, const int* index, const int slice_size,
-                      const int index_size, T* output) {
-  // paddle::framework::DDim output_dims = output->dims();
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int i = 0; i < index_size; ++i) {
-    int index_ = index[i];
-    memcpy(output + index_ * slice_size, src + i * slice_size, slice_bytes);
-  }
-}
-
 /**
  * Return a updated tensor from source tensor, scattered according to index:
  * dst[i] = src[index[i]]
@@ -70,7 +57,12 @@ void ScatterAssign(const platform::Place& place,
   size_t slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  CPUScatterAssign<T>(p_src, p_index, slice_size, index_size, p_output);
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
+  }
 }
 
 }  // namespace operators
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index 89d23945e0..831eabdae4 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
 
     Out->ShareDataWith<T>(*Ref);
 
-    GPUTScatter<T>(ctx.GetPlace(), Updates, Index, Out);
+    GPUScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
   }
 };
 
@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates = dO[Index]
-    GPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    GPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
   }
 };
 
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index 74b2718f43..771a1f2ddb 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -56,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
-    CPUTGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    CPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
   }
 };
 

From 61cc3ae4d13a798f341ceb5b2240b92526b3f43f Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 28 Sep 2017 11:52:03 -0700
Subject: [PATCH 04/37] Stabilize elementwise_mul by using double precision

---
 paddle/pybind/pybind.cc                       | 16 +++--
 paddle/pybind/tensor_py.h                     | 15 ++++-
 python/paddle/v2/framework/tests/op_test.py   | 60 +++++++++++++------
 .../tests/test_elementwise_mul_op.py          | 32 +++++-----
 4 files changed, 78 insertions(+), 45 deletions(-)

diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index d85bf6c7fa..f4121e9d71 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -77,20 +77,18 @@ PYBIND11_PLUGIN(core) {
            })
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
+      .def("set", PyCPUTensorSetFromArray<double>)
 #ifndef PADDLE_ONLY_CPU
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
+      .def("set", PyCUDATensorSetFromArray<double>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
-      .def("set_float_element",
-           [](Tensor &self, size_t offset, float f) {
-             // TODO(yuyang18): Only support GPU now.
-             self.data<float>()[offset] = f;
-           })
-      .def("get_float_element", [](Tensor &self, size_t offset) -> float {
-        // TODO(yuyang18): Only support GPU now.
-        return self.data<float>()[offset];
-      });
+      .def("set_float_element", TensorSetElement<float>)
+      .def("get_float_element", TensorGetElement<float>)
+      .def("set_double_element", TensorSetElement<double>)
+      .def("get_double_element", TensorGetElement<double>)
+      .def("dtype", [](Tensor &self) { return ToDataType(self.type()); });
 
   py::class_<LoDTensor, Tensor>(m, "LoDTensor")
       .def_buffer(
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 10621e90ee..3e3e6bc031 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -73,10 +73,23 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 };
 }  // namespace details
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
-  auto buffer_info = details::CastToPyBufferImpl<true, 0, float, int>()(tensor);
+  auto buffer_info =
+      details::CastToPyBufferImpl<true, 0, float, int, double>()(tensor);
   return buffer_info;
 }
 
+template <typename T>  // T: element type the tensor buffer is read as
+T TensorGetElement(framework::Tensor &self, size_t offset) {
+  PADDLE_ENFORCE(platform::is_cpu_place(self.place()));  // CPU tensors only
+  return self.data<T>()[offset];  // NOTE: offset is not range-checked
+}
+
+template <typename T>  // T: element type the tensor buffer is written as
+void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
+  PADDLE_ENFORCE(platform::is_cpu_place(self.place()));  // CPU tensors only
+  self.data<T>()[offset] = elem;  // NOTE: offset is not range-checked
+}
+
 template <typename T>
 void PyCPUTensorSetFromArray(
     framework::Tensor &self,
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 89979044f2..70ae50d401 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -69,24 +69,27 @@ def set_input(scope, op, inputs, place):
 
 
 def set_output_grad(scope, op, outputs, place):
+    def __set_tensor__(name):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if out_dtype == core.DataType.FP64:
+            data = np.ones(out_tensor.shape(), dtype=np.float64)
+        elif out_dtype == core.DataType.FP32:
+            data = np.ones(out_tensor.shape(), dtype=np.float32)
+        else:
+            raise ValueError("Not supported data type " + str(out_dtype))
+
+        grad_tensor.set(data, place)
+
     for out_name, out_dup in Operator.get_op_outputs(op.type()):
         if out_name in outputs:
             if out_dup:
                 sub_out = outputs[out_name]
                 for sub_out_name, _ in sub_out:
-                    out_tensor = scope.find_var(sub_out_name).get_tensor()
-                    grad_tensor = scope.new_var(grad_var_name(
-                        sub_out_name)).get_tensor()
-                    grad_tensor.set_dims(out_tensor.shape())
-                    data = np.ones(out_tensor.shape(), dtype=np.float32)
-                    grad_tensor.set(data, place)
+                    __set_tensor__(sub_out_name)
             else:
-                out_tensor = scope.find_var(out_name).get_tensor()
-                grad_tensor = scope.new_var(grad_var_name(out_name)).get_tensor(
-                )
-                grad_tensor.set_dims(out_tensor.shape())
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-                grad_tensor.set(data, place)
+                __set_tensor__(out_name)
 
 
 def get_numeric_gradient(scope,
@@ -96,7 +99,6 @@ def get_numeric_gradient(scope,
                          output_names,
                          delta=0.005,
                          in_place=False):
-
     set_input(scope, op, inputs, core.CPUPlace())
 
     tensor_to_check = scope.find_var(input_to_check).get_tensor()
@@ -115,7 +117,29 @@ def get_numeric_gradient(scope,
 
     tensor_to_check = scope.find_var(input_to_check).get_tensor()
     tensor_size = product(tensor_to_check.get_dims())
-    gradient_flat = np.zeros(shape=(tensor_size, ), dtype='float32')
+    tensor_to_check_dtype = tensor_to_check.dtype()
+    if tensor_to_check_dtype == core.DataType.FP32:
+        tensor_to_check_dtype = np.float32
+    elif tensor_to_check_dtype == core.DataType.FP64:
+        tensor_to_check_dtype = np.float64
+    else:
+        raise ValueError("Not supported data type " + str(
+            tensor_to_check_dtype))
+
+    gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
+
+    def __get_elem__(tensor, i):
+        if tensor_to_check_dtype == np.float32:
+            return tensor.get_float_element(i)
+        else:
+            return tensor.get_double_element(i)
+
+    def __set_elem__(tensor, i, e):
+        if tensor_to_check_dtype == np.float32:
+            tensor.set_float_element(i, e)
+        else:
+            tensor.set_double_element(i, e)
+
     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
     for i in xrange(tensor_size):
@@ -123,20 +147,20 @@ def get_numeric_gradient(scope,
             set_input(scope, op, inputs, core.CPUPlace())
 
         # get one input element throw it's index i.
-        origin = tensor_to_check.get_float_element(i)
+        origin = __get_elem__(tensor_to_check, i)
         # add delta to it, run op and then get the sum of the result tensor.
         x_pos = origin + delta
-        tensor_to_check.set_float_element(i, x_pos)
+        __set_elem__(tensor_to_check, i, x_pos)
         y_pos = get_output()
 
         if in_place:
             set_input(scope, op, inputs, core.CPUPlace())
 
         x_neg = origin - delta
-        tensor_to_check.set_float_element(i, x_neg)
+        __set_elem__(tensor_to_check, i, x_neg)
         y_neg = get_output()
 
-        tensor_to_check.set_float_element(i, origin)
+        __set_elem__(tensor_to_check, i, origin)
         gradient_flat[i] = (y_pos - y_neg) / delta / 2
 
     return gradient_flat.reshape(tensor_to_check.get_dims())
diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
index cee4385a81..261ca9cb3d 100644
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
@@ -7,8 +7,8 @@ class ElementwiseMulOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float64"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float64")
         }
         self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
 
@@ -16,23 +16,21 @@ class ElementwiseMulOp(OpTest):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
+        self.check_grad(['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
 
 
 class TestElementwiseMulOp_Vector(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.random((32, )).astype("float32"),
-            'Y': np.random.random((32, )).astype("float32")
+            'X': np.random.random((32, )).astype("float64"),
+            'Y': np.random.random((32, )).astype("float64")
         }
         self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
 
@@ -41,8 +39,8 @@ class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(2).astype(np.float32)
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(2).astype(np.float64)
         }
 
         self.attrs = {'axis': 0}
@@ -55,8 +53,8 @@ class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3).astype(np.float32)
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(3).astype(np.float64)
         }
 
         self.attrs = {'axis': 1}
@@ -69,8 +67,8 @@ class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(4).astype(np.float32)
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(4).astype(np.float64)
         }
 
         self.outputs = {
@@ -82,8 +80,8 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float64),
+            'Y': np.random.rand(3, 4).astype(np.float64)
         }
 
         self.attrs = {'axis': 1}

From 54892c079735aaffafc7388486482e06ff139439 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 28 Sep 2017 11:59:17 -0700
Subject: [PATCH 05/37] Simplify op_test

---
 python/paddle/v2/framework/tests/op_test.py | 42 +++++++++------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 70ae50d401..23794151bd 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -12,17 +12,19 @@ def grad_var_name(var_name):
 def create_op(scope, op_type, inputs, outputs, attrs):
     kwargs = dict()
 
+    def __create_var__(name, var_name):
+        scope.new_var(var_name)
+        kwargs[name].append(var_name)
+
     for in_name, in_dup in Operator.get_op_inputs(op_type):
         if in_name in inputs:
             kwargs[in_name] = []
             if in_dup:
                 sub_in = inputs[in_name]
                 for sub_in_name, _ in sub_in:
-                    var = scope.new_var(sub_in_name)
-                    kwargs[in_name].append(sub_in_name)
+                    __create_var__(in_name, sub_in_name)
             else:
-                var = scope.new_var(in_name)
-                kwargs[in_name].append(in_name)
+                __create_var__(in_name, in_name)
 
     for out_name, out_dup in Operator.get_op_outputs(op_type):
         if out_name in outputs:
@@ -30,11 +32,9 @@ def create_op(scope, op_type, inputs, outputs, attrs):
             if out_dup:
                 sub_out = outputs[out_name]
                 for sub_out_name, _ in sub_out:
-                    var = scope.new_var(sub_out_name)
-                    kwargs[out_name].append(sub_out_name)
+                    __create_var__(out_name, sub_out_name)
             else:
-                var = scope.new_var(out_name)
-                kwargs[out_name].append(out_name)
+                __create_var__(out_name, out_name)
 
     for attr_name in Operator.get_op_attr_names(op_type):
         if attr_name in attrs:
@@ -44,28 +44,22 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 
 
 def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        tensor = scope.find_var(var_name).get_tensor()
+        if isinstance(var, tuple):
+            tensor.set_lod(var[1])
+            var = var[0]
+        tensor.set_dims(var.shape)
+        tensor.set(var, place)
+
     for in_name, in_dup in Operator.get_op_inputs(op.type()):
         if in_name in inputs:
             if in_dup:
                 sub_in = inputs[in_name]
                 for sub_in_name, sub_in_val in sub_in:
-                    var = scope.find_var(sub_in_name)
-                    tensor = var.get_tensor()
-                    sub_in_array = sub_in_val[0] \
-                        if isinstance(sub_in_val, tuple) else sub_in_val
-                    tensor.set_dims(sub_in_array.shape)
-                    tensor.set(sub_in_array, place)
-                    if isinstance(sub_in_val, tuple):
-                        tensor.set_lod(sub_in_val[1])
+                    __set_input__(sub_in_name, sub_in_val)
             else:
-                var = scope.find_var(in_name)
-                tensor = var.get_tensor()
-                in_val = inputs[in_name]
-                in_array = in_val[0] if isinstance(in_val, tuple) else in_val
-                tensor.set_dims(in_array.shape)
-                tensor.set(in_array, place)
-                if isinstance(in_val, tuple):
-                    tensor.set_lod(in_val[1])
+                __set_input__(in_name, inputs[in_name])
 
 
 def set_output_grad(scope, op, outputs, place):

From 279178e457dfb12c15ddb4e51d8f75e75ad6db1f Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 28 Sep 2017 16:45:13 -0700
Subject: [PATCH 06/37] Fix bug in test_prelu and test_xe

They were using float64 for FP32 kernel before.
---
 python/paddle/v2/framework/tests/test_cross_entropy_op.py | 2 +-
 python/paddle/v2/framework/tests/test_prelu_op.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index 1de514dff4..4ea14da7fd 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -80,7 +80,7 @@ class TestCrossEntropyOp3(OpTest):
         cross_entropy2 = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
 
-        self.inputs = {"X": X, "Label": label}
+        self.inputs = {"X": X, "Label": label.astype(np.float32)}
         self.outputs = {"Y": cross_entropy}
         self.attrs = {"softLabel": True}
 
diff --git a/python/paddle/v2/framework/tests/test_prelu_op.py b/python/paddle/v2/framework/tests/test_prelu_op.py
index 676fd9f7c5..7be932ac8f 100644
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
@@ -17,7 +17,7 @@ class PReluTest(OpTest):
 
         x_np_sign = np.sign(x_np)
         x_np = x_np_sign * np.maximum(x_np, .005)
-        alpha_np = np.array([.1])
+        alpha_np = np.array([.1], dtype="float32")
         self.inputs = {'X': x_np, 'Alpha': alpha_np}
         out_np = np.maximum(self.inputs['X'], 0.)
         out_np = out_np + np.minimum(self.inputs['X'],

From e3a642e027e3c749e5405f26478e28887cab504a Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Sat, 30 Sep 2017 15:39:42 -0700
Subject: [PATCH 07/37] Extract BaseClass of grad_op_desc_maker and add some
 common methods

---
 paddle/framework/details/op_registry.h |   6 +-
 paddle/framework/grad_op_desc_maker.h  | 115 +++++++++++++++++++++++++
 paddle/framework/op_desc.h             |  22 ++++-
 paddle/framework/op_info.h             |   8 +-
 4 files changed, 139 insertions(+), 12 deletions(-)
 create mode 100644 paddle/framework/grad_op_desc_maker.h

diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h
index d2516ccc1e..daa474e8c5 100644
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/framework/grad_op_desc_maker.h"
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/op_proto_maker.h"
 #include "paddle/framework/operator.h"
@@ -96,7 +97,10 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
 template <typename T>
 struct OpInfoFiller<T, kGradOpDescMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
-    info->grad_op_maker_ = new T();
+    info->grad_op_maker_ = [](const OpDescBind& fwd_op) {
+      T maker(fwd_op);
+      return maker();
+    };
   }
 };
 }  // namespace details
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
new file mode 100644
index 0000000000..cb4d160bd0
--- /dev/null
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+// Base of grad-op-desc makers: wraps a forward op desc and exposes helpers.
+class GradOpDescMakerBase {
+ public:
+  explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {}
+  // Only a reference to fwd_op is kept, so fwd_op must outlive this maker.
+  virtual ~GradOpDescMakerBase() = default;
+  virtual std::vector<OpDescBind> operator()() const = 0;
+
+ protected:
+  static std::vector<std::string> ToGradNames(
+      const std::vector<std::string>& var_names) {
+    std::vector<std::string> ret_val;  // GradVarName(n) for every input n
+    ret_val.reserve(var_names.size());
+    std::transform(var_names.begin(), var_names.end(),
+                   std::back_inserter(ret_val), GradVarName);
+    return ret_val;
+  }
+  // Gradient names of the variables bound to forward input `name`.
+  std::vector<std::string> InputGrad(const std::string& name) const {
+    return ToGradNames(fwd_op_.Input(name));
+  }
+  // Gradient names of the variables bound to forward output `name`.
+  std::vector<std::string> OutputGrad(const std::string& name) const {
+    return ToGradNames(fwd_op_.Output(name));
+  }
+  // Parameter names of the forward op's inputs / outputs.
+  std::vector<std::string> InputParamNames() const {
+    return this->fwd_op_.InputParamNames();
+  }
+
+  std::vector<std::string> OutputParamNames() const {
+    return this->fwd_op_.OutputParamNames();
+  }
+  // Variable names bound to the forward op's input / output `name`.
+  std::vector<std::string> Input(const std::string& name) const {
+    return fwd_op_.Input(name);
+  }
+
+  std::vector<std::string> Output(const std::string& name) const {
+    return fwd_op_.Output(name);
+  }
+  // The forward op's whole attribute map.
+  const std::unordered_map<std::string, Attribute>& Attrs() const {
+    return fwd_op_.GetAttrMap();
+  }
+  // Looks up one attribute; PADDLE_ENFORCE fails if `name` is absent.
+  const Attribute& GetAttr(const std::string& name) const {
+    auto& map = fwd_op_.GetAttrMap();
+    auto it = map.find(name);
+    PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name);
+    return it->second;
+  }
+
+  std::string ForwardOpType() const { return this->fwd_op_.Type(); }
+
+ private:
+  const OpDescBind& fwd_op_;  // not owned
+};
+
+// A maker that produces exactly one grad op desc, built by Apply().
+class SingleGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;  // inherit (fwd_op) ctor
+  std::vector<OpDescBind> operator()() const { return {this->Apply()}; }
+
+ protected:
+  virtual OpDescBind Apply() const = 0;
+};
+
+// Default maker: feeds forward ins/outs and output grads, emits input grads.
+class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  virtual OpDescBind Apply() const {
+    OpDescBind grad;
+    grad.SetType(this->GradOpType());
+    for (auto& input_param : this->InputParamNames()) {
+      grad.SetInput(input_param, this->Input(input_param));
+      grad.SetOutput(GradVarName(input_param), this->InputGrad(input_param));
+    }
+    for (auto& output_param : this->OutputParamNames()) {
+      grad.SetInput(output_param, this->Output(output_param));
+      grad.SetInput(GradVarName(output_param), this->OutputGrad(output_param));
+    }
+    grad.SetAttrMap(this->Attrs());  // grad op reuses the forward attributes
+    return grad;
+  }
+  // Grad op type defaults to "<forward type>_grad"; subclasses may override.
+  virtual std::string GradOpType() const { return ForwardOpType() + "_grad"; }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 0cf7d13971..851a305061 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -60,17 +60,31 @@ class OpDescBind {
 
   void SetBlockAttr(const std::string &name, BlockDescBind &block);
 
-  // Only be used in C++
-  void SetAttrMap(const std::unordered_map<std::string, Attribute> &attr_map);
-
   Attribute GetAttr(const std::string &name) const;
 
   int GetBlockAttr(const std::string &name) const;
 
-  // Only be used in C++
+  // The following methods should only be used in C++
   const std::unordered_map<std::string, Attribute> &GetAttrMap() const;
 
+  void SetAttrMap(const std::unordered_map<std::string, Attribute> &attr_map);
+
+  std::vector<std::string> InputParamNames() const { return MapKeys(inputs_); }
+  std::vector<std::string> OutputParamNames() const {
+    return MapKeys(outputs_);
+  }
+
  private:
+  template <typename MapType>
+  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
+    std::vector<typename MapType::key_type> ret_val;
+    ret_val.resize(map.size());  // std::transform assigns through begin()
+    std::transform(
+        map.begin(), map.end(), ret_val.begin(),
+        [](const typename MapType::value_type &pair) { return pair.first; });
+    return ret_val;
+  }
+
   struct SetAttrDescVisitor : public boost::static_visitor<void> {
     explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
     mutable OpDesc::Attr *attr_;
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index 6d1ee4dece..8149c0061a 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -29,16 +29,10 @@ using OpCreator = std::function<OperatorBase*(
     const std::string& /*type*/, const VariableNameMap& /*inputs*/,
     const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 
-class GradOpDescMakerBase {
- public:
-  virtual ~GradOpDescMakerBase() = default;
-  virtual std::vector<OpDescBind> operator()(const OpDescBind&) const = 0;
-};
-
 struct OpInfo {
   OpCreator creator_;
   std::string grad_op_type_;
-  GradOpDescMakerBase* grad_op_maker_{nullptr};
+  std::function<std::vector<OpDescBind>(const OpDescBind&)> grad_op_maker_;
   OpProto* proto_{nullptr};
   OpAttrChecker* checker_{nullptr};
 

From 3e99b166ba147b8d954332a9be882bee25ca6591 Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Sun, 1 Oct 2017 08:29:09 +0000
Subject: [PATCH 08/37] add generic add operator

---
 paddle/framework/backward.cc | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 0ec18de5b8..c625c0caf7 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -141,9 +141,35 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         net->ops_[op_offset]->Rename(name, dup_outputs.back());
       }
       // collect all the offset to append `add` op for each alias
-      insert_position.push_back(
-          {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
-                                               {{"Out", {name}}}, {})});
+      //
+      // one variable is shared between multiple operators.
+      // insert add operator one by one, then add it to output
+      if (dup_outputs.size() == 2) {
+        insert_position.push_back(
+            {dup_op.back(),
+             OpRegistry::CreateOp(
+                 "add", {{"X", {dup_outputs[0]}}, {"Y", {dup_outputs[1]}}},
+                 {{"Out", {name}}}, {})});
+      } else {
+        for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+             ++output_idx) {
+          auto insert_add_x = dup_outputs[output_idx];
+          auto insert_add_y = dup_outputs[output_idx];
+          auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
+          // first add op inserted
+          if (output_idx == dup_outputs.size() - 1) {
+            insert_add_out = name;
+          }
+          if (output_idx != 0) {
+            insert_add_y = name + "@SHARED@" + std::to_string(output_idx);
+          }
+          insert_position.push_back(
+              {dup_op.back(),
+               OpRegistry::CreateOp(
+                   "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
+                   {{"Out", {insert_add_out}}}, {})});
+        }
+      }
     }
 
     // make sure the inserted `add` ops follow the BFS order.

From c08635898f3a57cdf19c45f12b1aac28d864c73e Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Sun, 1 Oct 2017 08:36:04 +0000
Subject: [PATCH 09/37] fix typo

---
 paddle/framework/backward.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index c625c0caf7..b850939040 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -161,7 +161,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
             insert_add_out = name;
           }
           if (output_idx != 0) {
-            insert_add_y = name + "@SHARED@" + std::to_string(output_idx);
+            insert_add_y = name + "@SHARED@" + std::to_string(output_idx-1);
           }
           insert_position.push_back(
               {dup_op.back(),

From e9a9dd6d4d5fb6428917cffb678cfa0b582fc018 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Sun, 1 Oct 2017 09:18:21 -0700
Subject: [PATCH 10/37] relauch ci

---
 paddle/framework/backward.cc | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index b850939040..fbacfeed94 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -144,31 +144,23 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       //
       // one variable is shared between multiple operators.
       // insert add operator one by one, then add it to output
-      if (dup_outputs.size() == 2) {
+      for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+           ++output_idx) {
+        auto insert_add_x = dup_outputs[output_idx];
+        auto insert_add_y = dup_outputs[output_idx];
+        auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
+        // first add op inserted
+        if (output_idx == dup_outputs.size() - 2) {
+          insert_add_out = name;
+        }
+        if (output_idx != 0) {
+          insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
+        }
         insert_position.push_back(
             {dup_op.back(),
              OpRegistry::CreateOp(
-                 "add", {{"X", {dup_outputs[0]}}, {"Y", {dup_outputs[1]}}},
-                 {{"Out", {name}}}, {})});
-      } else {
-        for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
-             ++output_idx) {
-          auto insert_add_x = dup_outputs[output_idx];
-          auto insert_add_y = dup_outputs[output_idx];
-          auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
-          // first add op inserted
-          if (output_idx == dup_outputs.size() - 1) {
-            insert_add_out = name;
-          }
-          if (output_idx != 0) {
-            insert_add_y = name + "@SHARED@" + std::to_string(output_idx-1);
-          }
-          insert_position.push_back(
-              {dup_op.back(),
-               OpRegistry::CreateOp(
-                   "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
-                   {{"Out", {insert_add_out}}}, {})});
-        }
+                 "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
+                 {{"Out", {insert_add_out}}}, {})});
       }
     }
 

From 3723304da953cce7aa88d1fdbd684bff91412dae Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 2 Oct 2017 11:56:14 -0700
Subject: [PATCH 11/37] Add missing ctor

---
 paddle/framework/grad_op_desc_maker.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
index cb4d160bd0..b4b6d54bf3 100644
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -79,6 +79,8 @@ class GradOpDescMakerBase {
 
 class SingleGradOpDescMaker : public GradOpDescMakerBase {
  public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+
   std::vector<OpDescBind> operator()() const { return {this->Apply()}; }
 
  protected:
@@ -86,6 +88,9 @@ class SingleGradOpDescMaker : public GradOpDescMakerBase {
 };
 
 class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
  protected:
   virtual OpDescBind Apply() const {
     OpDescBind grad;

From 9b54ad18f8c6c974cff39b9e128a3ef82dd57455 Mon Sep 17 00:00:00 2001
From: caoying03 <caoying03@baidu.com>
Date: Sat, 30 Sep 2017 16:06:52 +0800
Subject: [PATCH 12/37] add configuration helper for resize layer.

---
 doc/api/v1/index_cn.rst                       |  2 +-
 doc/api/v2/config/layer.rst                   |  5 ++++
 .../paddle/trainer_config_helpers/layers.py   | 25 ++++++++++++++++-
 .../tests/configs/file_list.sh                |  2 +-
 .../protostr/test_resize_layer.protostr       | 27 +++++++++++++++++++
 .../tests/configs/test_resize_layer.py        |  6 +++++
 6 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py

diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst
index 3718cd73a2..cf146dc088 100644
--- a/doc/api/v1/index_cn.rst
+++ b/doc/api/v1/index_cn.rst
@@ -21,7 +21,7 @@ Model Config API
     trainer_config_helpers/optimizers.rst
     trainer_config_helpers/data_sources.rst
     trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/activations.rst
     trainer_config_helpers/poolings.rst
     trainer_config_helpers/networks.rst
     trainer_config_helpers/evaluators.rst
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index c94627a728..d4e9d53e5c 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -345,6 +345,11 @@ clip
 ..  autoclass:: paddle.v2.layer.clip
     :noindex:
 
+resize
+------
+..  autoclass:: paddle.v2.layer.resize
+    :noindex:
+
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 74025d2a7b..d37f29d2c4 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -142,6 +142,7 @@ __all__ = [
     'img_pool3d_layer',
     'scale_shift_layer',
     'img_conv3d_layer',
+    'resize_layer',
 ]
 
 
@@ -250,6 +251,8 @@ class LayerType(object):
     KMAX_SEQ_SCORE = 'kmax_seq_score'
     SCALE_SHIFT_LAYER = 'scale_shift'
 
+    RESIZE = 'resize'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -6473,7 +6476,7 @@ def switch_order_layer(input,
                        act=None,
                        layer_attr=None):
     """
-    This layer switch dimension order of image input. 
+    This layer switch dimension order of image input.
     From order "batchSize, channels, height, width"
     to order "batchSize, height, width, channels".
 
@@ -6932,3 +6935,23 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
         bias=ParamAttr.to_bias(bias_attr))
     return LayerOutput(
         name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default("resize")
+def resize_layer(input, size, name=None):
+    """
+    The resize layer resizes the input matrix with a shape of [Height, Width]
+    into the output matrix with a shape of [Height x Width / size, size],
+    where size is the parameter of this layer indicating the output dimension.
+
+    :param input: The input to this layer.
+    :type input: LayerOutput.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param size: The resized output dimension of this layer.
+    :type size: int
+    :return: A LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
+    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=size)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 8a204a96f3..6a4550c209 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -10,6 +10,6 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D)
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
new file mode 100644
index 0000000000..9399252b23
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
@@ -0,0 +1,27 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__resize_0__"
+  type: "resize"
+  size: 150
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__resize_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__resize_0__"
+  input_layer_names: "input"
+  output_layer_names: "__resize_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
new file mode 100644
index 0000000000..09a6f50733
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+resized = resize_layer(input=data, size=150)
+
+outputs(resized)

From 04e604b7198179d2feedd76b2cf455656698b21f Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Sat, 30 Sep 2017 16:55:40 -0700
Subject: [PATCH 13/37] Unify Map in OpDescBind

---
 paddle/framework/op_desc.cc | 27 ++++++++++++++++++++++++++-
 paddle/framework/op_desc.h  | 37 ++++++-------------------------------
 paddle/platform/enforce.h   |  4 ++--
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 0c12c55dc0..33a064890c 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -112,6 +112,30 @@ const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
   return attrs_;
 }
 
+struct SetAttrDescVisitor : public boost::static_visitor<void> {
+  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
+  mutable OpDesc::Attr *attr_;
+  void operator()(int v) const { attr_->set_i(v); }
+  void operator()(float v) const { attr_->set_f(v); }
+  void operator()(const std::string &v) const { attr_->set_s(v); }
+  void operator()(bool b) const { attr_->set_b(b); }
+
+  void operator()(const std::vector<int> &v) const {
+    VectorToRepeated(v, attr_->mutable_ints());
+  }
+  void operator()(const std::vector<float> &v) const {
+    VectorToRepeated(v, attr_->mutable_floats());
+  }
+  void operator()(const std::vector<std::string> &v) const {
+    VectorToRepeated(v, attr_->mutable_strings());
+  }
+  void operator()(const std::vector<bool> &v) const {
+    VectorToRepeated(v, attr_->mutable_bools());
+  }
+  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+};
+
 void OpDescBind::Sync() {
   if (need_update_) {
     this->op_desc_.mutable_inputs()->Clear();
@@ -134,7 +158,8 @@ void OpDescBind::Sync() {
       attr_desc->set_name(attr.first);
       attr_desc->set_type(
           static_cast<framework::AttrType>(attr.second.which() - 1));
-      boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second);
+      SetAttrDescVisitor visitor(attr_desc);
+      boost::apply_visitor(visitor, attr.second);
     }
 
     need_update_ = false;
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 0cf7d13971..e03b4d067f 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/var_desc.h"
 
 namespace paddle {
@@ -61,48 +62,22 @@ class OpDescBind {
   void SetBlockAttr(const std::string &name, BlockDescBind &block);
 
   // Only be used in C++
-  void SetAttrMap(const std::unordered_map<std::string, Attribute> &attr_map);
+  void SetAttrMap(const AttributeMap &attr_map);
 
   Attribute GetAttr(const std::string &name) const;
 
   int GetBlockAttr(const std::string &name) const;
 
   // Only be used in C++
-  const std::unordered_map<std::string, Attribute> &GetAttrMap() const;
+  const AttributeMap &GetAttrMap() const;
 
  private:
-  struct SetAttrDescVisitor : public boost::static_visitor<void> {
-    explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-    mutable OpDesc::Attr *attr_;
-    void operator()(int v) const { attr_->set_i(v); }
-    void operator()(float v) const { attr_->set_f(v); }
-    void operator()(const std::string &v) const { attr_->set_s(v); }
-    void operator()(bool b) const { attr_->set_b(b); }
-
-    void operator()(const std::vector<int> &v) const {
-      VectorToRepeated(v, attr_->mutable_ints());
-    }
-    void operator()(const std::vector<float> &v) const {
-      VectorToRepeated(v, attr_->mutable_floats());
-    }
-    void operator()(const std::vector<std::string> &v) const {
-      VectorToRepeated(v, attr_->mutable_strings());
-    }
-    void operator()(const std::vector<bool> &v) const {
-      VectorToRepeated(v, attr_->mutable_bools());
-    }
-    void operator()(BlockDesc *desc) const {
-      attr_->set_block_idx(desc->idx());
-    }
-    void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
-  };
-
   void Sync();
 
   OpDesc op_desc_;
-  std::unordered_map<std::string, std::vector<std::string>> inputs_;
-  std::unordered_map<std::string, std::vector<std::string>> outputs_;
-  std::unordered_map<std::string, Attribute> attrs_;
+  VariableNameMap inputs_;
+  VariableNameMap outputs_;
+  AttributeMap attrs_;
 
   // need_update_ indicate there some local changes not be synchronized. If
   // local changes should be synchronized, need_update_ should be set to true.
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index b523ef03c0..52bd23039b 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -185,7 +185,7 @@ inline void throw_on_error(T e) {
         std::make_exception_ptr(                                       \
             std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
         __FILE__, __LINE__);                                           \
-  } while (0)
+  } while (false)
 
 #define PADDLE_ENFORCE(...)                                             \
   do {                                                                  \
@@ -195,7 +195,7 @@ inline void throw_on_error(T e) {
       throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
                                               __FILE__, __LINE__);      \
     }                                                                   \
-  } while (0)
+  } while (false)
 
 /*
  * Some enforce helpers here, usage:

From c7ae0aac6660cc6cad2e7977ae573359433d484c Mon Sep 17 00:00:00 2001
From: kexinzhao <19hskevin87@gmail.com>
Date: Sun, 1 Oct 2017 09:36:02 -0700
Subject: [PATCH 14/37] Fix typo in new_op_cn.md

---
 doc/howto/dev/new_op_cn.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index 9d3d02ffc3..c823d7e9fc 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
     - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
-    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 
 
 - 在 `.cu`文件中注册GPU Kernel。

From 83764d491cc8a835a315f29a04fe1addfd1a05ae Mon Sep 17 00:00:00 2001
From: kexinzhao <19hskevin87@gmail.com>
Date: Sun, 1 Oct 2017 09:40:49 -0700
Subject: [PATCH 15/37] Update new_op_en.md

---
 doc/howto/dev/new_op_en.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index 57ff7caad1..1e88e1f5b4 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -205,7 +205,7 @@ The definition of its corresponding backward operator, if applicable, is similar
 
     - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
     - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
 - Registering GPU Kernel in `.cu` files

From 18799476c20743313eb8361efc6cd21886d95862 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 2 Oct 2017 11:39:10 -0700
Subject: [PATCH 16/37] Use `type_defs.h` to resolve cyclic dependencies

---
 paddle/framework/attribute.h | 10 +---------
 paddle/framework/op_desc.h   |  2 +-
 paddle/framework/op_info.h   |  7 +------
 paddle/framework/type_defs.h | 38 ++++++++++++++++++++++++++++++++++++
 4 files changed, 41 insertions(+), 16 deletions(-)
 create mode 100644 paddle/framework/type_defs.h

diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index c7559cefb6..d13530e340 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -21,20 +21,12 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"
 
 namespace paddle {
 namespace framework {
 
-// The order should be as same as framework.proto
-typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>, bool,
-                       std::vector<bool>, BlockDesc*>
-    Attribute;
-
-typedef std::unordered_map<std::string, Attribute> AttributeMap;
-
 ProgramDesc& GetProgramDesc();
 
 template <typename T>
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index e03b4d067f..0af4169715 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/attribute.h"
-#include "paddle/framework/op_info.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/framework/var_desc.h"
 
 namespace paddle {
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index 6d1ee4dece..470336d367 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -19,15 +19,10 @@
 #include <unordered_map>
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/op_desc.h"
+#include "paddle/framework/type_defs.h"
 
 namespace paddle {
 namespace framework {
-class OperatorBase;
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-
-using OpCreator = std::function<OperatorBase*(
-    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
-    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 
 class GradOpDescMakerBase {
  public:
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
new file mode 100644
index 0000000000..dec5066f1e
--- /dev/null
+++ b/paddle/framework/type_defs.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include "paddle/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+class OperatorBase;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+// The order should be as same as framework.proto
+using Attribute =
+    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                   std::vector<float>, std::vector<std::string>, bool,
+                   std::vector<bool>, BlockDesc*>;
+
+using AttributeMap = std::unordered_map<std::string, Attribute>;
+
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+
+}  // namespace framework
+}  // namespace paddle

From d2bd6f45cb82531eff7ce7e64360d75e351c643d Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Mon, 2 Oct 2017 15:07:02 -0700
Subject: [PATCH 17/37] "replace add with sum"

---
 paddle/framework/backward.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index fbacfeed94..35759f8e78 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -159,7 +159,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         insert_position.push_back(
             {dup_op.back(),
              OpRegistry::CreateOp(
-                 "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
+                 "add", {{"X", {insert_add_x}}, {"X", {insert_add_y}}},
                  {{"Out", {insert_add_out}}}, {})});
       }
     }

From 2bceab0fb446aa1c5370a613bf67041d4534f187 Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Sun, 1 Oct 2017 08:29:09 +0000
Subject: [PATCH 18/37] add generic add operator

---
 paddle/framework/backward.cc | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 0ec18de5b8..c625c0caf7 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -141,9 +141,35 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         net->ops_[op_offset]->Rename(name, dup_outputs.back());
       }
       // collect all the offset to append `add` op for each alias
-      insert_position.push_back(
-          {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
-                                               {{"Out", {name}}}, {})});
+      //
+      // one variable is shared between multiple operators.
+      // insert add operator one by one, then add it to output
+      if (dup_outputs.size() == 2) {
+        insert_position.push_back(
+            {dup_op.back(),
+             OpRegistry::CreateOp(
+                 "add", {{"X", {dup_outputs[0]}}, {"Y", {dup_outputs[1]}}},
+                 {{"Out", {name}}}, {})});
+      } else {
+        for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+             ++output_idx) {
+          auto insert_add_x = dup_outputs[output_idx];
+          auto insert_add_y = dup_outputs[output_idx];
+          auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
+          // first add op inserted
+          if (output_idx == dup_outputs.size() - 1) {
+            insert_add_out = name;
+          }
+          if (output_idx != 0) {
+            insert_add_y = name + "@SHARED@" + std::to_string(output_idx);
+          }
+          insert_position.push_back(
+              {dup_op.back(),
+               OpRegistry::CreateOp(
+                   "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
+                   {{"Out", {insert_add_out}}}, {})});
+        }
+      }
     }
 
     // make sure the inserted `add` ops follow the BFS order.

From 800085fe2d37e5ad3ef706701ffb008a2c668ee1 Mon Sep 17 00:00:00 2001
From: dzhwinter <dzhwinter@gmail.com>
Date: Sun, 1 Oct 2017 08:36:04 +0000
Subject: [PATCH 19/37] fix typo

---
 paddle/framework/backward.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index c625c0caf7..b850939040 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -161,7 +161,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
             insert_add_out = name;
           }
           if (output_idx != 0) {
-            insert_add_y = name + "@SHARED@" + std::to_string(output_idx);
+            insert_add_y = name + "@SHARED@" + std::to_string(output_idx-1);
           }
           insert_position.push_back(
               {dup_op.back(),

From f6496272cfcbb4d2ad9eb8a272a065007059a004 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Sun, 1 Oct 2017 09:18:21 -0700
Subject: [PATCH 20/37] relauch ci

---
 paddle/framework/backward.cc | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index b850939040..fbacfeed94 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -144,31 +144,23 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       //
       // one variable is shared between multiple operators.
       // insert add operator one by one, then add it to output
-      if (dup_outputs.size() == 2) {
+      for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+           ++output_idx) {
+        auto insert_add_x = dup_outputs[output_idx];
+        auto insert_add_y = dup_outputs[output_idx];
+        auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
+        // first add op inserted
+        if (output_idx == dup_outputs.size() - 2) {
+          insert_add_out = name;
+        }
+        if (output_idx != 0) {
+          insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
+        }
         insert_position.push_back(
             {dup_op.back(),
              OpRegistry::CreateOp(
-                 "add", {{"X", {dup_outputs[0]}}, {"Y", {dup_outputs[1]}}},
-                 {{"Out", {name}}}, {})});
-      } else {
-        for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
-             ++output_idx) {
-          auto insert_add_x = dup_outputs[output_idx];
-          auto insert_add_y = dup_outputs[output_idx];
-          auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
-          // first add op inserted
-          if (output_idx == dup_outputs.size() - 1) {
-            insert_add_out = name;
-          }
-          if (output_idx != 0) {
-            insert_add_y = name + "@SHARED@" + std::to_string(output_idx-1);
-          }
-          insert_position.push_back(
-              {dup_op.back(),
-               OpRegistry::CreateOp(
-                   "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
-                   {{"Out", {insert_add_out}}}, {})});
-        }
+                 "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
+                 {{"Out", {insert_add_out}}}, {})});
       }
     }
 

From a1b935e356573266dfff08d5fd279815492c8843 Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Mon, 2 Oct 2017 15:07:02 -0700
Subject: [PATCH 21/37] "replace add with sum"

---
 paddle/framework/backward.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index fbacfeed94..35759f8e78 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -159,7 +159,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         insert_position.push_back(
             {dup_op.back(),
              OpRegistry::CreateOp(
-                 "add", {{"X", {insert_add_x}}, {"Y", {insert_add_y}}},
+                 "add", {{"X", {insert_add_x}}, {"X", {insert_add_y}}},
                  {{"Out", {insert_add_out}}}, {})});
       }
     }

From 84b8baf1967e327712269e7632235438d09759d9 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Mon, 2 Oct 2017 15:50:24 -0700
Subject: [PATCH 22/37] gather scatter with cuda streams

---
 paddle/operators/gather.cu.h   | 13 ++++++++-----
 paddle/operators/gather_op.cu  |  5 ++---
 paddle/operators/scatter.cu.h  | 10 ++++++----
 paddle/operators/scatter_op.cu |  4 ++--
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
index b400c10440..2ae11376a2 100644
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -46,9 +46,9 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
  * return: output tensor
  */
 template <typename T>
-void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
-               Tensor* output) {
-  PADDLE_ENFORCE(platform::is_gpu_place(place));
+void GPUGather(const platform::DeviceContext& ctx, const Tensor* src,
+               const Tensor* index, Tensor* output) {
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
@@ -68,8 +68,11 @@ void GPUGather(const Place& place, const Tensor* src, const Tensor* index,
   int block = 512;
   int n = slice_size * index_size;
   int grid = (n + block - 1) / block;
-  GatherCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
-                                       slice_size);
+
+  GatherCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
 }
 
 }  // namespace operators
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
index 06004614b2..9937be5915 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    GPUGather<T>(ctx.GetPlace(), x, index, output);
+    GPUGather<T>(ctx.device_context(), x, index, output);
   }
 };
 
@@ -42,7 +42,6 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    LOG(INFO) << "Gather grad here";
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -53,7 +52,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto place = ctx.GetEigenDevice<platform::GPUPlace>();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
-    GPUScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
+    GPUScatterAssign<T>(ctx.device_context(), dO, Index, dX);
   }
 };
 
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
index add4791a79..f4a3965d94 100644
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
@@ -45,11 +45,11 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
  * return: output tensor
  */
 template <typename T>
-void GPUScatterAssign(const platform::Place& place,
+void GPUScatterAssign(const platform::DeviceContext& ctx,
                       const paddle::framework::Tensor* src,
                       const paddle::framework::Tensor* index,
                       paddle::framework::Tensor* output) {
-  PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
@@ -70,8 +70,10 @@ void GPUScatterAssign(const platform::Place& place,
   int n = slice_size * index_size;
   int grid = (n + block - 1) / block;
 
-  ScatterCUDAKernel<T><<<grid, block>>>(p_src, p_index, p_output, index_size,
-                                        slice_size);
+  ScatterCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
 }
 
 }  // namespace operators
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index 831eabdae4..6d13a876f9 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
 
     Out->ShareDataWith<T>(*Ref);
 
-    GPUScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), Updates, Index, Out);
   }
 };
 
@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    GPUGather<T>(ctx.device_context(), dOut, Index, dUpdates);
   }
 };
 

From 2ccaec4f57afe94f36ee4781bae6e0eec78b29a8 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Mon, 2 Oct 2017 18:31:55 -0700
Subject: [PATCH 23/37] gather scatter cond

---
 paddle/operators/cond_op.cc      | 5 ++---
 paddle/operators/gather.h        | 4 ++--
 paddle/operators/gather_op.h     | 4 ++--
 paddle/operators/gather_test.cc  | 4 +++-
 paddle/operators/scatter.h       | 4 ++--
 paddle/operators/scatter_op.h    | 4 ++--
 paddle/operators/scatter_test.cc | 4 +++-
 7 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index 55822827d9..7d7f1ba3b1 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -126,8 +126,7 @@ void CondOp::PrepareDataForSubnet(
       dim[0] = index_tensors[i].dims()[0];
       tensor_child->mutable_data<float>(dim, platform::CPUPlace());
 
-      CPUGather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
-                       tensor_child);
+      CPUGather<float>(dev_ctx, tensor_parent, &index_tensors[i], tensor_child);
     }
   }
 
@@ -188,7 +187,7 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
       Variable* var_child = sub_scopes[i]->FindVar(output);
       PADDLE_ENFORCE_NOT_NULL(var_child);
       auto* tensor_child = &var_child->Get<LoDTensor>();
-      ScatterAssign<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
+      ScatterAssign<float>(dev_ctx, tensor_child, &index_tensors[i],
                            tensor_parent);
     }
   }
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
index cb635f6825..1e39a6da27 100644
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -32,11 +32,11 @@ namespace operators {
  * return: output tensor
  */
 template <typename T>
-void CPUGather(const platform::Place& place,
+void CPUGather(const platform::DeviceContext& ctx,
                const paddle::framework::Tensor* src,
                const paddle::framework::Tensor* index,
                paddle::framework::Tensor* output) {
-  PADDLE_ENFORCE(platform::is_cpu_place(place));
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index fb065b8da7..5bd2c36f7b 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -36,7 +36,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    CPUGather<T>(ctx.GetPlace(), x, index, output);
+    CPUGather<T>(ctx.device_context(), x, index, output);
   }
 };
 
@@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     auto place = ctx.GetEigenDevice<platform::CPUPlace>();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
-    ScatterAssign<T>(ctx.GetPlace(), dO, Index, dX);
+    ScatterAssign<T>(ctx.device_context(), dO, Index, dX);
   }
 };
 
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
index 3c1d06ccd1..d8bf8dd9a4 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -41,7 +41,9 @@ TEST(Gather, GatherData) {
 
   int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
 
-  CPUGather<int>(CPUPlace(), src, index, output);
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext ctx(*cpu_place);
+  CPUGather<int>(ctx, src, index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h
index f895f22e28..0d174d3b5b 100644
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -33,11 +33,11 @@ using Tensor = framework::Tensor;
  * return: output tensor
  */
 template <typename T>
-void ScatterAssign(const platform::Place& place,
+void ScatterAssign(const platform::DeviceContext& ctx,
                    const paddle::framework::Tensor* src,
                    const paddle::framework::Tensor* index,
                    paddle::framework::Tensor* output) {
-  PADDLE_ENFORCE(platform::is_cpu_place(place));
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
   PADDLE_ENFORCE(index->dims().size() == 1);
   int index_size = index->dims()[0];
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index 771a1f2ddb..ac04968549 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -37,7 +37,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     // In place output: Out = Ref, Out[Index] += Updates
     Out->ShareDataWith<T>(*Ref);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterAssign<T>(ctx.GetPlace(), Updates, Index, Out);
+    ScatterAssign<T>(ctx.device_context(), Updates, Index, Out);
   }
 };
 
@@ -56,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
-    CPUGather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    CPUGather<T>(ctx.device_context(), dOut, Index, dUpdates);
   }
 };
 
diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc
index bace6419d0..321bba3dad 100644
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
@@ -40,7 +40,9 @@ TEST(scatter, ScatterUpdate) {
 
   float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
 
-  ScatterAssign<float>(CPUPlace(), src, index, output);
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext ctx(*cpu_place);
+  ScatterAssign<float>(ctx, src, index, output);
 
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));

From 494b3bda7d784315433b85826c9cbd18cac5723a Mon Sep 17 00:00:00 2001
From: dongzhihong <dzhwinter@gmail.com>
Date: Tue, 3 Oct 2017 10:28:57 -0700
Subject: [PATCH 24/37] fix backward test case

---
 paddle/framework/backward.cc      |  2 +-
 paddle/framework/backward_test.cc | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 35759f8e78..2c13ddd8d0 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -159,7 +159,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         insert_position.push_back(
             {dup_op.back(),
              OpRegistry::CreateOp(
-                 "add", {{"X", {insert_add_x}}, {"X", {insert_add_y}}},
+                 "sum", {{"X", {insert_add_x}}, {"X", {insert_add_y}}},
                  {{"Out", {insert_add_out}}}, {})});
       }
     }
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 6932f5b989..a36e7bde8c 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -133,15 +133,18 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class AddOpMaker : public OpProtoAndCheckerMaker {
+class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x").AsDuplicable();
-    AddOutput("Out", "out");
+    AddInput("X", "the input tensors of sum operator.")
+        .AsDuplicable()
+        .NotInGradient();
+    AddOutput("Out", "the output tensor of sum operator.").NotInGradient();
     AddComment("");
   }
 };
+
 }  // namespace framework
 }  // namespace paddle
 
@@ -154,7 +157,7 @@ REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
 REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
-REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
+REGISTER_OP(sum, f::NOP, f::SumOpMaker, sum_grad, f::NOP);
 REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
 REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
             f::NOP);
@@ -283,7 +286,7 @@ TEST(Backward, net_shared_weight) {
   ASSERT_TRUE(bwd->IsNetOp());
   auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
   ASSERT_EQ(3UL, bwd_net->ops_.size());
-  ASSERT_EQ("add", bwd_net->ops_[2]->Type());
+  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
 }
 
 TEST(Backward, op_register_grad_not_for_network) {

From 2d876b864395513de8db52db944ee5e8150d2730 Mon Sep 17 00:00:00 2001
From: zchen0211 <chenzhuoyuan07@gmail.com>
Date: Tue, 3 Oct 2017 10:54:22 -0700
Subject: [PATCH 25/37] gather scatter fix according to google style

---
 paddle/operators/cond_op.cc      |  4 ++--
 paddle/operators/gather.cu.h     | 14 +++++++-------
 paddle/operators/gather.h        | 18 +++++++++---------
 paddle/operators/gather_op.cu    |  4 ++--
 paddle/operators/gather_op.h     |  4 ++--
 paddle/operators/gather_test.cc  |  2 +-
 paddle/operators/scatter.cu.h    | 18 +++++++++---------
 paddle/operators/scatter.h       | 16 +++++++---------
 paddle/operators/scatter_op.cu   |  4 ++--
 paddle/operators/scatter_op.h    |  4 ++--
 paddle/operators/scatter_test.cc |  2 +-
 11 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index 7d7f1ba3b1..2737104a20 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -126,7 +126,7 @@ void CondOp::PrepareDataForSubnet(
       dim[0] = index_tensors[i].dims()[0];
       tensor_child->mutable_data<float>(dim, platform::CPUPlace());
 
-      CPUGather<float>(dev_ctx, tensor_parent, &index_tensors[i], tensor_child);
+      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
     }
   }
 
@@ -187,7 +187,7 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
       Variable* var_child = sub_scopes[i]->FindVar(output);
       PADDLE_ENFORCE_NOT_NULL(var_child);
       auto* tensor_child = &var_child->Get<LoDTensor>();
-      ScatterAssign<float>(dev_ctx, tensor_child, &index_tensors[i],
+      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
                            tensor_parent);
     }
   }
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
index 2ae11376a2..8d04ecd284 100644
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -46,14 +46,14 @@ __global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
  * return: output tensor
  */
 template <typename T>
-void GPUGather(const platform::DeviceContext& ctx, const Tensor* src,
-               const Tensor* index, Tensor* output) {
+void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
   // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
@@ -61,8 +61,8 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor* src,
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  const T* p_src = src->data<T>();
-  const int* p_index = index->data<int>();
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
   T* p_output = output->data<T>();
 
   int block = 512;
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
index 1e39a6da27..052db49cb3 100644
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -24,6 +24,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using framework::Tensor;
+
 /**
  * A thin wrapper for gathering on cpu tensor
  * Return a new tensor from source tensor, gathered according to index
@@ -32,21 +34,19 @@ namespace operators {
  * return: output tensor
  */
 template <typename T>
-void CPUGather(const platform::DeviceContext& ctx,
-               const paddle::framework::Tensor* src,
-               const paddle::framework::Tensor* index,
-               paddle::framework::Tensor* output) {
+void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
   PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
-  const T* p_src = src->data<T>();
-  const int* p_index = index->data<int>();
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
   T* p_output = output->data<T>();
 
   // slice size
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
index 9937be5915..92219d6a43 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -32,7 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    GPUGather<T>(ctx.device_context(), x, index, output);
+    GPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
 
@@ -52,7 +52,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto place = ctx.GetEigenDevice<platform::GPUPlace>();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
-    GPUScatterAssign<T>(ctx.device_context(), dO, Index, dX);
+    GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
 
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index 5bd2c36f7b..8276ed0d3d 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -36,7 +36,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    CPUGather<T>(ctx.device_context(), x, index, output);
+    CPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
 
@@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     auto place = ctx.GetEigenDevice<platform::CPUPlace>();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
-    ScatterAssign<T>(ctx.device_context(), dO, Index, dX);
+    ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
 
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
index d8bf8dd9a4..cbd86b8796 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -43,7 +43,7 @@ TEST(Gather, GatherData) {
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  CPUGather<int>(ctx, src, index, output);
+  CPUGather<int>(ctx, *src, *index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
index f4a3965d94..d95436be4f 100644
--- a/paddle/operators/scatter.cu.h
+++ b/paddle/operators/scatter.cu.h
@@ -19,6 +19,8 @@
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 #define CUDA_1D_KERNEL_LOOP(i, n)                              \
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
        i += blockDim.x * gridDim.x)
@@ -45,16 +47,14 @@ __global__ void ScatterCUDAKernel(const T* params, const int* indices,
  * return: output tensor
  */
 template <typename T>
-void GPUScatterAssign(const platform::DeviceContext& ctx,
-                      const paddle::framework::Tensor* src,
-                      const paddle::framework::Tensor* index,
-                      paddle::framework::Tensor* output) {
+void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                      const Tensor& index, Tensor* output) {
   // PADDLE_ENFORCE(platform::is_gpu_place(place));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
@@ -62,8 +62,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx,
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  const T* p_src = src->data<T>();
-  const int* p_index = index->data<int>();
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
   T* p_output = output->data<T>();
 
   int block = 512;
diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h
index 0d174d3b5b..c1fb844ebd 100644
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -33,20 +33,18 @@ using Tensor = framework::Tensor;
  * return: output tensor
  */
 template <typename T>
-void ScatterAssign(const platform::DeviceContext& ctx,
-                   const paddle::framework::Tensor* src,
-                   const paddle::framework::Tensor* index,
-                   paddle::framework::Tensor* output) {
+void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                   const Tensor& index, Tensor* output) {
   PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   auto dst_dims = output->dims();
 
-  const T* p_src = src->data<T>();
-  const int* p_index = index->data<int>();
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
   T* p_output = output->data<T>();
 
   // check src shape and dst shape should match
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index 6d13a876f9..06f4d75944 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
 
     Out->ShareDataWith<T>(*Ref);
 
-    GPUScatterAssign<T>(ctx.device_context(), Updates, Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
   }
 };
 
@@ -51,7 +51,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.device_context(), dOut, Index, dUpdates);
+    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
   }
 };
 
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index ac04968549..6101219006 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -37,7 +37,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     // In place output: Out = Ref, Out[Index] += Updates
     Out->ShareDataWith<T>(*Ref);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterAssign<T>(ctx.device_context(), Updates, Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
   }
 };
 
@@ -56,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     dRef->ShareDataWith<T>(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
-    CPUGather<T>(ctx.device_context(), dOut, Index, dUpdates);
+    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
   }
 };
 
diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc
index 321bba3dad..00dbdacbfe 100644
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
@@ -42,7 +42,7 @@ TEST(scatter, ScatterUpdate) {
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  ScatterAssign<float>(ctx, src, index, output);
+  ScatterAssign<float>(ctx, *src, *index, output);
 
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));

From 3395bf7ad0f8dd6443bb2075e67331d3152eef43 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 3 Oct 2017 12:48:03 -0700
Subject: [PATCH 26/37] Remove duplicated method in OpDesc

---
 paddle/framework/grad_op_desc_maker.h | 12 ++++++------
 paddle/framework/op_desc.cc           | 18 ------------------
 paddle/framework/op_desc.h            | 10 ++--------
 3 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
index b4b6d54bf3..e6d63e4b8a 100644
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -44,12 +44,12 @@ class GradOpDescMakerBase {
     return ToGradNames(fwd_op_.Output(name));
   }
 
-  std::vector<std::string> InputParamNames() const {
-    return this->fwd_op_.InputParamNames();
+  std::vector<std::string> InputNames() const {
+    return this->fwd_op_.InputNames();
   }
 
-  std::vector<std::string> OutputParamNames() const {
-    return this->fwd_op_.OutputParamNames();
+  std::vector<std::string> OutputNames() const {
+    return this->fwd_op_.OutputNames();
   }
 
   std::vector<std::string> Input(const std::string& name) const {
@@ -96,12 +96,12 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
     OpDescBind grad;
     grad.SetType(this->GradOpType());
 
-    for (auto& input_param : this->InputParamNames()) {
+    for (auto& input_param : this->InputNames()) {
       grad.SetInput(input_param, this->Input(input_param));
       grad.SetOutput(GradVarName(input_param), this->InputGrad(input_param));
     }
 
-    for (auto& output_param : this->OutputParamNames()) {
+    for (auto& output_param : this->OutputNames()) {
       grad.SetInput(output_param, this->Output(output_param));
       grad.SetInput(GradVarName(output_param), this->OutputGrad(output_param));
     }
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 33a064890c..852f0f1eb8 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -31,15 +31,6 @@ const std::vector<std::string> &OpDescBind::Input(
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::InputNames() const {
-  std::vector<std::string> retv;
-  retv.reserve(this->inputs_.size());
-  for (auto &ipt : this->inputs_) {
-    retv.push_back(ipt.first);
-  }
-  return retv;
-}
-
 void OpDescBind::SetInput(const std::string &param_name,
                           const std::vector<std::string> &args) {
   need_update_ = true;
@@ -54,15 +45,6 @@ const std::vector<std::string> &OpDescBind::Output(
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::OutputNames() const {
-  std::vector<std::string> retv;
-  retv.reserve(this->outputs_.size());
-  for (auto &ipt : this->outputs_) {
-    retv.push_back(ipt.first);
-  }
-  return retv;
-}
-
 void OpDescBind::SetOutput(const std::string &param_name,
                            const std::vector<std::string> &args) {
   need_update_ = true;
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 12706b9d71..508bcaa67e 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -35,15 +35,11 @@ class OpDescBind {
 
   const std::vector<std::string> &Input(const std::string &name) const;
 
-  std::vector<std::string> InputNames() const;
-
   void SetInput(const std::string &param_name,
                 const std::vector<std::string> &args);
 
   const std::vector<std::string> &Output(const std::string &name) const;
 
-  std::vector<std::string> OutputNames() const;
-
   void SetOutput(const std::string &param_name,
                  const std::vector<std::string> &args);
 
@@ -71,10 +67,8 @@ class OpDescBind {
   // Only be used in C++
   void SetAttrMap(const AttributeMap &attr_map);
 
-  std::vector<std::string> InputParamNames() const { return MapKeys(inputs_); }
-  std::vector<std::string> OutputParamNames() const {
-    return MapKeys(outputs_);
-  }
+  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
+  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
 
  private:
   template <typename MapType>

From 495a80a73645bcabe9e392a6b1a2878845f7a234 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 3 Oct 2017 13:17:10 -0700
Subject: [PATCH 27/37] Update design doc

---
 doc/design/register_grad_op.md | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
index 12b04fb271..cdb7a8435b 100644
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
@@ -33,22 +33,45 @@ The mapping relationship between an operator and its gradient operators is a fun
 
 ```cpp
 // (OpDesc) --> vector<OpDesc>
-using GradOpDescMaker = std::function<std::vector<OpDesc>(const OpDesc&)>;
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
 ```
 
-The function take a `OpDesc` of the forward operator and return one or many gradient operator descriptions.
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper around the protobuf message `OpDesc` that allows an `OpDesc` to be manipulated efficiently.
 
 The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be
 
 ```cpp
 struct OpInfo {
-  GradOpDescMaker grad_op_maker_;
+  std::function<std::vector<OpDescBind>(const OpDescBind&)>  grad_op_maker_;
   ...
 };
 ```
 
 The `grad_op_maker_ ` is `nullptr` if the operator does not have associated gradient operators.
 
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<OpDescBind> operator()()const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<OpDescBind>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<OpDescBind>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
+
 We should chagne register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`.
 
 The user interface should be

From 62de57e1ee28bdb349148028d079b4e3192ecb46 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Tue, 3 Oct 2017 14:01:22 -0700
Subject: [PATCH 28/37] Update lod_tensor.md

---
 paddle/framework/lod_tensor.md | 168 +++++++++++++++++++--------------
 1 file changed, 97 insertions(+), 71 deletions(-)

diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
index 07bbdf9416..0fa14f3470 100644
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -1,147 +1,173 @@
 # Design Doc: LoD (Level-of-Detail) Tensor
 
-PaddlePaddle's RNN doesn't require that all instances have the same length.  To do so, we introduce an extension to Tensor, namely, LoD Tensor.
+As other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require that all sequences in a mini-batch are of the same length. Thus no need for padding zeros.
 
-## Challenge of Variable-length Inputs
+|                       | TensorFlow | PaddlePaddle |
+|-----------------------|------------|--------------|
+| RNN                   | Support    | Support      |
+| recursive RNN         | Support    | Support      |
+| padding zeros         | Must       | No need      |
+| blob data type        | Tensor     | LoDTensor    |
 
-People usually represent a mini-batch by a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  So a transformation, T, of all images can be a matrix multiplication of the 10xOx32-dimensional tensor T and the 10x32x32 Tensor.
+PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segments a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
 
-Another example is that each mini-batch contains 32 sentences, where each word is a D-dimensional one-hot vector.  If all sentences have the same length L, we can represent this mini-batch by a 32xLxD tensor.  However, in most cases, sentences have variable lengths, and we will need an index data structure to record these variable lengths.
 
-## LoD as a Solution
+## The Challenge: Variable-length Sequences
 
-### Mini-Batch of variable-length sentences
+Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor.
 
-Let's imagine a mini-batch of 3 variable lengths sentences, containing 3, 1, and 2 words respectively.  We can represent it by a (3+1+2)xD tensor plus some index information:
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution represented by variable-sized Tensors.
+
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
+
+## A Solution: The LoD Index
+
+Let is visit this challenge from examples.
+
+### A Mini-Batch of Sentences
+
+Let's imagine a mini-batch of 3 variable lengths sentences composed by 3, 1, and 2 words respectively.  We can represent it by a (3+1+2)xD tensor plus some index information:
 
 ```
-   3
 3   1 2
 ||| | ||
 ```
 
-Each `|` represents a D-dimensional word vectors.  The number 3 on top indicate 3 sentences, and numbers 3, 1, and 2 on the second level represent the number of words in each sentence.
+where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
+
+### Recursive Sequences
+
+Let check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of words:
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
 
-### Mini-Batch of variable-length videos
+### A Mini-Batch of Videos
 
-This approach generalizes to the case where elements are not words, but higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  If a mini-batch contains 3 videos of 3, 1, and 2 frames respectively.  The underlying tensor is of size (3+1+2)x640x480.  The index information illustrates as:
+LoD Tensor generalizes to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames respectively.
 
 ```
-     3
 3     1  2
 口口口 口 口口
 ```
 
-where each `口` represents an image.
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
 
-### Mini-Batch of fixed-size images
+### A Mini-Batch of Images
 
-Let's get back to a typical example, image classification, where each mini-batch has M fixed-sized images.  The LoD Tensor representation is
+In traditional cases like a mini-batch with N fixed-sized images,  the LoD Tensor representation is as
 
 ```
-     M
 1 1 1 1     1
 口口口口 ... 口
 ```
 
-The many 1's on the second level seem duplicated.  For this particular case of 2 levels and the second level always have length 1, we can ignore the LoD index.
-
-### Design and summarization
-
-In summary, as long as that the essential elements (words  or images) have the same size, we can represent mini-batches by a LoD Tensor:
+It doesn't loss anything to ignore the many 1's in the index and to consider this LoD Tensor a usual Tensor:
 
-- The underlying tensor has size LxD1xD2x..., where D1xD2... is the size of the essential elements, and
-- The first dimension size L has an additonal property -- a LoD index as a nested vector:
+```
+口口口口 ... 口
+```
 
-  ```c++
-  typedef std::vector<std::<vector>> LoD;
-  ```
+### Model Parameters
 
-- The LoD index is not necessary when there are only two levels and all elements of the second level have length 1.
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
 
-## Slicing of LoD Tensor
+## The LoD Tensor
 
-Consider that we have a network with three levels of RNN: the top level one handles articles, the second level one handles sentences, and the basic level one handles words.  This network requires that mini-batches represented by 3 level LoD Tensor, for example,
+Let us revisit above example of the 2-level LoD Tensor
 
 ```
-         3
 3           1  2
 3   2  4    1  2  3
 ||| || |||| |  || |||
 ```
 
-To allow each level of RNN to handle its input, we define **the slicing of a LoD Tensor is defined as getting the j-th sequence on level i, or the <i,j>-slice**
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+
+For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
+
+### The LoD Index
 
-For example, the <2,1>-slice of above slice is
+We can save the LoD index in above example
 
 ```
-2
-||
+3           1  2
+3   2  4    1  2  3
 ```
 
-and the <1,2>-slice of above example is
+in a not-full 2D matrix:
 
+```c++
+typedef std::vector<std::vector<int> > LoD;
 ```
-2
-2  3
-|| |||
-```
 
-Let's go on slicing this slice.  Its <1,1>-slice is
+where
+
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
+
+## The Offset Representation
+
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+
+In the above example, we accumulate the length of elementary sequences:
 
 ```
-1
-1
-|
+3 2 4 1 2 3
 ```
 
-### The Slicing Algorithm
+into offsets
 
-The algorithm, with over-simplified data structure, is defined as
+```
+0  3  5   9   10  12   15
+   =  =   =   =   =    =
+   3  2+3 4+5 1+9 2+10 3+12
+```
 
-```c++
-typedef std::vector<std::vector<int>> LoD;
+so we know that the first sentence is from word 0 to word 3, and the second sentence from work 3 to word 5.
 
-struct LoDTensor {
-  LoD lod_;
-  float* tensor_;
-};
+Similarly, lengths in the top level LoD
 
-LoDTensor Slice(const LoDTensor& lodt, int level, int sequence);
+```
+3 1 2
 ```
 
-Let us revisit the example above
+is transformed into offsets of elements/words:
 
 ```
-         3
-3           1  2
-3   2  4    1  2  3
-||| || |||| |  || |||
+0 9     10  15
+  =     =   =
+  3+2+4 1+9 2+3+10
 ```
 
-Suppose that we want to retrieve the <1,2>-slice
+so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
+
+The complete offset representation is as follows:
 
 ```
-2
-2  3
-|| |||
+0          9 10      15
+0  3  5    9 10  12  15
+||| || |||| |  || |||
 ```
 
-we will need to find out the starting position of this slice by summing over all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10.
+## Slicing of LoD Tensors
+
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
 
-To avoid the traversal of the LoD tree at slicing time,  we can do it at the construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offset of the next level.  For example, above LoD Tensor can be transformed into
+For example, the <2>-slice of above example is
 
 ```
-        0
-0           9  10
-0   3  5    9  10 12
-||| || |||| |  || |||
+10      15
+10  12  15
+  || |||
 ```
 
-We don't really need the 0 on top, so the LoD Tensor could be
+and the <2,0>-slice of above slice is
 
 ```
-0           9  10
-0   3  5    9  10 12
-||| || |||| |  || |||
+10  12
+  ||
 ```

From 48a9ab4a0896b3102637fb7606b27bbf6b097bc3 Mon Sep 17 00:00:00 2001
From: Markus Kliegl <klieglmarkus@baidu.com>
Date: Tue, 3 Oct 2017 14:21:42 -0700
Subject: [PATCH 29/37] minor language fixes

---
 paddle/framework/lod_tensor.md | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
index 0fa14f3470..597bc48cf3 100644
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -1,6 +1,6 @@
 # Design Doc: LoD (Level-of-Detail) Tensor
 
-As other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require that all sequences in a mini-batch are of the same length. Thus no need for padding zeros.
+Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus, there is no need for padding zeros.
 
 |                       | TensorFlow | PaddlePaddle |
 |-----------------------|------------|--------------|
@@ -9,24 +9,24 @@ As other deep learning systems, PaddlePaddle supports training models from seque
 | padding zeros         | Must       | No need      |
 | blob data type        | Tensor     | LoDTensor    |
 
-PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segments a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
+PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
 
 
 ## The Challenge: Variable-length Sequences
 
 Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor.
 
-Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution represented by variable-sized Tensors.
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
 
 The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
 
 ## A Solution: The LoD Index
 
-Let is visit this challenge from examples.
+To understand our solution, it is best to look at some examples.
 
 ### A Mini-Batch of Sentences
 
-Let's imagine a mini-batch of 3 variable lengths sentences composed by 3, 1, and 2 words respectively.  We can represent it by a (3+1+2)xD tensor plus some index information:
+Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
 
 ```
 3   1 2
@@ -37,7 +37,7 @@ where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2
 
 ### Recursive Sequences
 
-Let check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of words:
+Let's check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of a variable number of words:
 
 ```
 3           1  2
@@ -47,7 +47,7 @@ Let check another example of a 2-level LoD Tensor.  Consider a mini-batch of thr
 
 ### A Mini-Batch of Videos
 
-LoD Tensor generalizes to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames respectively.
+LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
 
 ```
 3     1  2
@@ -65,7 +65,7 @@ In traditional cases like a mini-batch with N fixed-sized images,  the LoD Tenso
 口口口口 ... 口
 ```
 
-It doesn't loss anything to ignore the many 1's in the index and to consider this LoD Tensor a usual Tensor:
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
 
 ```
 口口口口 ... 口
@@ -91,7 +91,7 @@ For example, the third sentence in above example is identified by branch <0,2>,
 
 ### The LoD Index
 
-We can save the LoD index in above example
+We can save the LoD index in the above example
 
 ```
 3           1  2
@@ -129,13 +129,13 @@ into offsets
 
 so we know that the first sentence is from word 0 to word 3, and the second sentence from work 3 to word 5.
 
-Similarly, lengths in the top level LoD
+Similarly, the lengths in the top level LoD
 
 ```
 3 1 2
 ```
 
-is transformed into offsets of elements/words:
+are transformed into offsets of elements/words as follows:
 
 ```
 0 9     10  15
@@ -148,9 +148,9 @@ so we can tell that the first article is from word 0 to word 9, and the second a
 The complete offset representation is as follows:
 
 ```
-0          9 10      15
-0  3  5    9 10  12  15
-||| || |||| |  || |||
+0           9 10       15
+0   3  5    9 10  12   15
+ ||| || |||| |  ||  |||
 ```
 
 ## Slicing of LoD Tensors

From b2806135a53cbe85fbc764375d9cecc2596ab4be Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 3 Oct 2017 14:41:21 -0700
Subject: [PATCH 30/37] Change Interface to unique_ptr

---
 doc/design/register_grad_op.md        |  6 +++---
 paddle/framework/grad_op_desc_maker.h | 28 +++++++++++++++------------
 paddle/framework/op_info.h            |  2 +-
 paddle/framework/type_defs.h          |  4 ++++
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
index cdb7a8435b..3cf8a59446 100644
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
@@ -42,7 +42,7 @@ The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_`
 
 ```cpp
 struct OpInfo {
-  std::function<std::vector<OpDescBind>(const OpDescBind&)>  grad_op_maker_;
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
   ...
 };
 ```
@@ -55,11 +55,11 @@ We propose a base class called `GradOpDescMakerBase` to let operator developers
 class GradOpDescMakerBase {
 public:
   GradOpDescMakerBase(const OpDescBind& );
-  virtual std::vector<OpDescBind> operator()()const = 0;
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
 };
 ```
 
-We can convert `GradOpDescMakerBase` to `std::function<std::vector<OpDescBind>(const OpDescBind&)>` by
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
 
 ```cpp
 using GradOpMaker = ...;
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
index e6d63e4b8a..e9ae6e2206 100644
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -24,7 +24,7 @@ class GradOpDescMakerBase {
   explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {}
 
   virtual ~GradOpDescMakerBase() = default;
-  virtual std::vector<OpDescBind> operator()() const = 0;
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
 
  protected:
   static std::vector<std::string> ToGradNames(
@@ -81,10 +81,14 @@ class SingleGradOpDescMaker : public GradOpDescMakerBase {
  public:
   using GradOpDescMakerBase::GradOpDescMakerBase;
 
-  std::vector<OpDescBind> operator()() const { return {this->Apply()}; }
+  std::vector<std::unique_ptr<OpDescBind>> operator()() const {
+    std::vector<std::unique_ptr<OpDescBind>> retv;
+    retv.emplace_back(this->Apply());
+    return retv;
+  }
 
  protected:
-  virtual OpDescBind Apply() const = 0;
+  virtual std::unique_ptr<OpDescBind> Apply() const = 0;
 };
 
 class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
@@ -92,23 +96,23 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
   using SingleGradOpDescMaker::SingleGradOpDescMaker;
 
  protected:
-  virtual OpDescBind Apply() const {
-    OpDescBind grad;
-    grad.SetType(this->GradOpType());
+  virtual std::unique_ptr<OpDescBind> Apply() const {
+    auto* grad = new OpDescBind();
+    grad->SetType(this->GradOpType());
 
     for (auto& input_param : this->InputNames()) {
-      grad.SetInput(input_param, this->Input(input_param));
-      grad.SetOutput(GradVarName(input_param), this->InputGrad(input_param));
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(GradVarName(input_param), this->InputGrad(input_param));
     }
 
     for (auto& output_param : this->OutputNames()) {
-      grad.SetInput(output_param, this->Output(output_param));
-      grad.SetInput(GradVarName(output_param), this->OutputGrad(output_param));
+      grad->SetInput(output_param, this->Output(output_param));
+      grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
     }
 
-    grad.SetAttrMap(this->Attrs());
+    grad->SetAttrMap(this->Attrs());
 
-    return grad;
+    return std::unique_ptr<OpDescBind>(grad);
   }
 
   virtual std::string GradOpType() const {
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index 806a960018..8b7882485f 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -28,7 +28,7 @@ namespace framework {
 struct OpInfo {
   OpCreator creator_;
   std::string grad_op_type_;
-  std::function<std::vector<OpDescBind>(const OpDescBind&)> grad_op_maker_;
+  GradOpMakerFN grad_op_maker_;
   OpProto* proto_{nullptr};
   OpAttrChecker* checker_{nullptr};
 
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
index dec5066f1e..a5b9472213 100644
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -20,6 +20,7 @@
 namespace paddle {
 namespace framework {
 class OperatorBase;
+class OpDescBind;
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 
 // The order should be as same as framework.proto
@@ -34,5 +35,8 @@ using OpCreator = std::function<OperatorBase*(
     const std::string& /*type*/, const VariableNameMap& /*inputs*/,
     const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 
+using GradOpMakerFN =
+    std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>;
+
 }  // namespace framework
 }  // namespace paddle

From e08367c80678804fc388004ba2ab72f754bc1143 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Tue, 3 Oct 2017 15:53:42 -0700
Subject: [PATCH 31/37] Add few blank lines

---
 paddle/framework/lod_tensor.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
index 597bc48cf3..d147f1c425 100644
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -20,6 +20,7 @@ Both examples show that the elements of sequences are usually of the same size.
 
 The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
 
+
 ## A Solution: The LoD Index
 
 To understand our solution, it is best to look at some examples.
@@ -75,6 +76,7 @@ In this case, we don't lose any information by ignoring the many 1's in the inde
 
 A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
 
+
 ## The LoD Tensor
 
 Let us revisit above example of the 2-level LoD Tensor

From 703321e2be91d8e70e6578fb3c91f76607f2e587 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 3 Oct 2017 16:17:49 -0700
Subject: [PATCH 32/37] Fix CI

---
 paddle/framework/op_desc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 508bcaa67e..397393f796 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -76,7 +76,7 @@ class OpDescBind {
     std::vector<typename MapType::key_type> ret_val;
     ret_val.reserve(map.size());
     std::transform(
-        map.begin(), map.end(), ret_val.begin(),
+        map.begin(), map.end(), std::back_inserter(ret_val),
         [](const typename MapType::value_type &pair) { return pair.first; });
     return ret_val;
   }

From b0d2235834cd1a94c39c1e937f95c58bd7319abc Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Tue, 3 Oct 2017 16:24:24 -0700
Subject: [PATCH 33/37] Bug fix

---
 paddle/framework/backward.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 2c13ddd8d0..89583ade95 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -147,7 +147,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
            ++output_idx) {
         auto insert_add_x = dup_outputs[output_idx];
-        auto insert_add_y = dup_outputs[output_idx];
+        auto insert_add_y = dup_outputs[output_idx + 1];
         auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
         // first add op inserted
         if (output_idx == dup_outputs.size() - 2) {

From f4491fa46d1583caa7f007a581995435a32f8dab Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Tue, 3 Oct 2017 16:34:21 -0700
Subject: [PATCH 34/37] Fix bug

---
 paddle/framework/backward.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 89583ade95..c0188c0e55 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -158,9 +158,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         }
         insert_position.push_back(
             {dup_op.back(),
-             OpRegistry::CreateOp(
-                 "sum", {{"X", {insert_add_x}}, {"X", {insert_add_y}}},
-                 {{"Out", {insert_add_out}}}, {})});
+             OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}},
+                                  {{"Out", {insert_add_out}}}, {})});
       }
     }
 
@@ -200,7 +199,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
 
     // process recurrent gradient op as a special operator.
     if (forwardOp.Type() == "recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or
+      // NOTE clean up cycle call somewhere (RNN's stepnet contains itself),
+      // or
       // this will result in infinite loop.
       const auto& rnnop =
           *static_cast<const operators::RecurrentOp*>(&forwardOp);

From 324876bbbfb0dd84f2172f951a2a4880bee32df4 Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Tue, 3 Oct 2017 17:26:02 -0700
Subject: [PATCH 35/37] Changing learning rate from type Input(float) to
 Input(tensor) (#4578)

---
 paddle/operators/sgd_op.cc                      | 3 +++
 paddle/operators/sgd_op.h                       | 2 +-
 python/paddle/v2/framework/tests/test_sgd_op.py | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 8f9eae4186..1a4d3fb8c5 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -32,6 +32,9 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("param_out"),
                    "Output(param_out) of SGDOp should not be null.");
 
+    auto lr_dims = ctx->GetInputDim("learning_rate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("param");
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("grad"),
                       "Two input of SGD Op's dimension must be same.");
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index 977d201ced..e2ae65beb0 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -31,7 +31,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
     auto param = ctx.Input<Tensor>("param");
     auto grad = ctx.Input<Tensor>("grad");
     auto param_out = ctx.Output<Tensor>("param_out");
-    float lr = *ctx.Input<float>("learning_rate");
+    float lr = ctx.Input<Tensor>("learning_rate")->data<float>()[0];
 
     param_out->mutable_data<T>(ctx.GetPlace());
 
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index f1125f4edb..c05364490f 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -8,7 +8,7 @@ class TestSGDOp(OpTest):
         self.op_type = "sgd"
         w = np.random.random((102, 105)).astype("float32")
         g = np.random.random((102, 105)).astype("float32")
-        lr = 0.1
+        lr = np.array([0.1]).astype("float32")
 
         self.inputs = {'param': w, 'grad': g, 'learning_rate': lr}
         self.outputs = {'param_out': w - lr * g}

From eed2c1e1d6237f421c9b8c0bbd2fd51d53beddcf Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Wed, 4 Oct 2017 09:29:13 -0700
Subject: [PATCH 36/37] Changing SGD inputs and outputs to conform to Operator
 naming convention (#4586)

---
 paddle/operators/sgd_op.cc                    | 32 +++++++++----------
 paddle/operators/sgd_op.h                     |  8 ++---
 .../paddle/v2/framework/tests/test_sgd_op.py  |  4 +--
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 1a4d3fb8c5..31d491f130 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -23,22 +23,22 @@ class SGDOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContextBase *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("param"),
-                   "Input(param) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("grad"),
-                   "Input(grad) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("learning_rate"),
-                   "Input(learning_rate) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("param_out"),
-                   "Output(param_out) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of SGDOp should not be null.");
 
-    auto lr_dims = ctx->GetInputDim("learning_rate");
+    auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
-    auto param_dim = ctx->GetInputDim("param");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("grad"),
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
                       "Two input of SGD Op's dimension must be same.");
-    ctx->SetOutputDim("param_out", param_dim);
+    ctx->SetOutputDim("ParamOut", param_dim);
   }
 };
 
@@ -46,10 +46,10 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("param", "input parameter");
-    AddInput("learning_rate", "learning rate of sgd");
-    AddInput("grad", "input gradient");
-    AddOutput("param_out", "output parameter");
+    AddInput("Param", "Input parameter");
+    AddInput("LearningRate", "Learning rate of SGD");
+    AddInput("Grad", "Input gradient");
+    AddOutput("ParamOut", "output parameter");
     AddComment(R"DOC(
 
 Simplest sgd algorithm.
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index e2ae65beb0..d72d333a9a 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -28,10 +28,10 @@ template <typename Place, typename T>
 class SGDOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param = ctx.Input<Tensor>("param");
-    auto grad = ctx.Input<Tensor>("grad");
-    auto param_out = ctx.Output<Tensor>("param_out");
-    float lr = ctx.Input<Tensor>("learning_rate")->data<float>()[0];
+    auto param = ctx.Input<Tensor>("Param");
+    auto grad = ctx.Input<Tensor>("Grad");
+    auto param_out = ctx.Output<Tensor>("ParamOut");
+    float lr = ctx.Input<Tensor>("LearningRate")->data<float>()[0];
 
     param_out->mutable_data<T>(ctx.GetPlace());
 
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index c05364490f..2dd881e5e1 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -10,8 +10,8 @@ class TestSGDOp(OpTest):
         g = np.random.random((102, 105)).astype("float32")
         lr = np.array([0.1]).astype("float32")
 
-        self.inputs = {'param': w, 'grad': g, 'learning_rate': lr}
-        self.outputs = {'param_out': w - lr * g}
+        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
+        self.outputs = {'ParamOut': w - lr * g}
 
     def test_check_output(self):
         self.check_output()

From 84500f9487164f3cf17625c876c15d754b932ced Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 4 Oct 2017 11:25:46 -0700
Subject: [PATCH 37/37] Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`

This change was generated mechanically with the following shell commands:

```bash
sed -i 's#ifdef PADDLE_ONLY_CPU#ifndef PADDLE_WITH_GPU#g' `find ./paddle/ -name '*.h' -o -name '*.cc' -o -name '*.cpp' -o -name '*.c' -o -name '*.cu'`
sed -i 's#ifndef PADDLE_ONLY_CPU#ifdef PADDLE_WITH_GPU#g' `find ./paddle/ -name '*.h' -o -name '*.cc' -o -name '*.cpp' -o -name '*.c' -o -name '*.cu'`
```
---
 cmake/configure.cmake                         |  2 +-
 paddle/api/Util.cpp                           |  2 +-
 paddle/capi/Matrix.cpp                        |  2 +-
 paddle/framework/lod_tensor.h                 |  4 +--
 paddle/framework/op_registry.h                |  2 +-
 paddle/framework/operator.cc                  |  2 +-
 paddle/framework/tensor_impl.h                |  4 +--
 paddle/framework/tensor_test.cc               |  8 +++---
 paddle/function/BlockExpandOp.cpp             |  2 +-
 paddle/function/ContextProjectionOp.cpp       |  2 +-
 paddle/function/CosSimOp.cpp                  |  2 +-
 paddle/function/CropOp.cpp                    |  2 +-
 paddle/function/CrossMapNormalOp.cpp          |  2 +-
 paddle/function/DepthwiseConvOp.cpp           |  2 +-
 paddle/function/DepthwiseConvOpTest.cpp       |  2 +-
 paddle/function/GemmConvOp.cpp                |  2 +-
 paddle/function/GemmConvOpTest.cpp            |  2 +-
 paddle/function/Im2ColTest.cpp                |  2 +-
 paddle/function/MulOp.cpp                     |  2 +-
 paddle/function/PadOp.cpp                     |  2 +-
 paddle/function/RowConvOp.cpp                 |  2 +-
 paddle/function/SwitchOp.cpp                  |  2 +-
 paddle/gserver/layers/BatchNormBaseLayer.cpp  |  2 +-
 .../layers/BatchNormalizationLayer.cpp        |  6 ++---
 paddle/gserver/layers/PoolLayer.cpp           |  4 +--
 paddle/gserver/tests/LayerGradUtil.cpp        |  2 +-
 paddle/gserver/tests/test_BatchNorm.cpp       |  2 +-
 paddle/gserver/tests/test_ConvUnify.cpp       |  2 +-
 paddle/gserver/tests/test_DetectionOutput.cpp |  2 +-
 paddle/gserver/tests/test_Evaluator.cpp       |  2 +-
 paddle/gserver/tests/test_KmaxSeqScore.cpp    |  2 +-
 paddle/gserver/tests/test_LayerGrad.cpp       | 26 +++++++++----------
 paddle/gserver/tests/test_NetworkCompare.cpp  |  2 +-
 paddle/gserver/tests/test_PriorBox.cpp        |  2 +-
 .../gserver/tests/test_ProtoDataProvider.cpp  |  6 ++---
 paddle/gserver/tests/test_PyDataProvider.cpp  |  4 +--
 .../gserver/tests/test_SelectiveFCLayer.cpp   |  8 +++---
 .../gserver/tests/test_SeqSliceLayerGrad.cpp  |  2 +-
 paddle/gserver/tests/test_WarpCTCLayer.cpp    |  2 +-
 paddle/math/Matrix.cpp                        |  6 ++---
 paddle/math/SparseMatrix.cpp                  |  2 +-
 paddle/math/Vector.cpp                        |  6 ++---
 paddle/math/tests/test_Allocator.cpp          |  4 +--
 paddle/math/tests/test_BaseMatrix.cpp         |  2 +-
 paddle/math/tests/test_CpuGpuVector.cpp       |  2 +-
 paddle/math/tests/test_ExecViaCpu.cpp         |  2 +-
 paddle/math/tests/test_GpuProfiler.cpp        |  2 +-
 paddle/math/tests/test_Matrix.cpp             |  2 +-
 paddle/math/tests/test_SparseMatrix.cpp       |  6 ++---
 paddle/math/tests/test_Tensor.cu              | 20 +++++++-------
 paddle/math/tests/test_TrainingAlgorithm.cpp  |  2 +-
 paddle/math/tests/test_batchTranspose.cpp     |  2 +-
 paddle/math/tests/test_lazyAssign.cu          |  4 +--
 paddle/math/tests/test_matrixCompare.cpp      |  2 +-
 paddle/math/tests/test_perturbation.cpp       |  2 +-
 .../math/tests/test_sparseMatrixCompare.cpp   |  2 +-
 paddle/memory/detail/buddy_allocator.cc       |  2 +-
 paddle/memory/detail/system_allocator.cc      |  2 +-
 paddle/memory/detail/system_allocator.h       |  2 +-
 paddle/memory/detail/system_allocator_test.cc |  2 +-
 paddle/memory/memcpy.cc                       |  2 +-
 paddle/memory/memcpy.h                        |  2 +-
 paddle/memory/memory.cc                       |  2 +-
 paddle/memory/memory_test.cc                  |  2 +-
 paddle/operators/detail/strided_memcpy.h      |  2 +-
 paddle/operators/math/im2col_test.cc          |  4 +--
 paddle/operators/math/math_function_test.cc   |  2 +-
 paddle/operators/strided_memcpy_test.cc       |  2 +-
 paddle/platform/device_context.cc             |  2 +-
 paddle/platform/device_context.h              |  4 +--
 paddle/platform/enforce.h                     |  4 +--
 paddle/platform/gpu_info.h                    |  2 +-
 paddle/platform/variant.h                     |  2 +-
 paddle/pserver/test/SocketTest.cpp            |  2 +-
 paddle/pserver/test/test_ProtoServer.cpp      |  2 +-
 paddle/pybind/pybind.cc                       | 12 ++++-----
 paddle/pybind/tensor_py.h                     |  2 +-
 paddle/trainer/MergeModel.cpp                 |  2 +-
 paddle/trainer/tests/test_Compare.cpp         |  2 +-
 paddle/trainer/tests/test_CompareSparse.cpp   |  4 +--
 paddle/trainer/tests/test_Trainer.cpp         |  4 +--
 paddle/trainer/tests/test_TrainerOnePass.cpp  |  6 ++---
 .../test_recurrent_machine_generation.cpp     |  2 +-
 paddle/utils/Flags.cpp                        |  2 +-
 paddle/utils/Util.h                           |  2 +-
 paddle/utils/Version.h                        |  2 +-
 86 files changed, 141 insertions(+), 141 deletions(-)

diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 51c3b918cc..926a7b1d69 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -49,11 +49,11 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
 
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_GPU)
     FIND_PACKAGE(CUDA REQUIRED)
 
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index d369df5d4e..7446d892fd 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 
 bool isGpuVersion() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   return false;
 #else
   return true;
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index d898ebe261..5b3737a759 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -46,7 +46,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
   paddle::real* buf = ptr->mat->getRowBuf(rowID);
   size_t width = ptr->mat->getWidth();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
   std::copy(rowArray, rowArray + width, buf);
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 49786a4a66..b12c95b6b7 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include <memory>
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/system/cuda/experimental/pinned_allocator.h>
@@ -29,7 +29,7 @@
 namespace paddle {
 namespace framework {
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
 template <typename T>
 using Vector = std::vector<T>;
 #else
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 4ee2c7d275..aca6579f36 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -211,7 +211,7 @@ class OpKernelRegistrar : public Registrar {
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
 #define USE_OP_KERNEL(op_type)        \
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 1012a30b0a..21c1c6f9e6 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -25,7 +25,7 @@ Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
   return *device_context_.GetEigenDevice<platform::CPUPlace>();
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index a5405f9c31..1cde1f74b8 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -65,7 +65,7 @@ inline T* Tensor::mutable_data(platform::Place place) {
       holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
           boost::get<platform::CPUPlace>(place), size));
     } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
     }
 #else
@@ -103,7 +103,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   else if (platform::is_gpu_place(src_place) &&
            platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index e2ec738de3..86c6945ab5 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -74,7 +74,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(p1, p2);
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   {
     Tensor src_tensor;
     float* p1 = nullptr;
@@ -126,7 +126,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   {
     Tensor src_tensor;
     Tensor dst_tensor;
@@ -163,7 +163,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   {
     Tensor src_tensor;
     src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -218,7 +218,7 @@ TEST(Tensor, CopyFrom) {
       EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
     }
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
index a89b6bba45..ad78f5f584 100644
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
@@ -194,7 +194,7 @@ public:
 
 REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
 #endif
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index b87750b742..ab18c39df8 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -395,7 +395,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     CPU,
                     ContextProjectionBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     GPU,
                     ContextProjectionForwardFunc);
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index 7ece7b2dfe..4418f144d3 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -233,7 +233,7 @@ private:
 
 REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
 #endif
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
index f12ee43e3d..39504cc2c1 100644
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
@@ -169,7 +169,7 @@ private:
 
 REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
 #endif
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index ef878bfbba..1cf0918bed 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -336,7 +336,7 @@ private:
 
 REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
 #endif
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 2f3112fe65..7656ab3d0a 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -292,7 +292,7 @@ REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
 REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
                     CPU,
                     DepthwiseConvGradFilterFunction);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
 REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
                     GPU,
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
index d8e8c889d5..39033ecb2b 100644
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(DepthwiseConv, Forward) {
   DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "DepthwiseConv-GPU", forward);
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index f8cf4ebea8..68e08c1480 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -340,7 +340,7 @@ public:
 REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/function/GemmConvOpTest.cpp
index 5283d79a5a..bd1cf3c6a4 100644
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
@@ -24,7 +24,7 @@ TEST(GemmConv, NaiveConv) {
       "NaiveConv-CPU", "GemmConv-CPU", forward);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(GemmConv, Forward) {
   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "GemmConv-GPU", forward);
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index acc88a553a..55325e94b5 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -116,7 +116,7 @@ void TestIm2ColFunctor() {
 
 TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
 
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 25e41edad5..655026320c 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -341,7 +341,7 @@ private:
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
 #endif
 }  // namespace paddle
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index adba7c92ec..24c9bf4e72 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -207,7 +207,7 @@ private:
 
 REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
 #endif
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index b6501e8f4d..09e702f71a 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -217,7 +217,7 @@ public:
 
 REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
 #endif
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
index 01e252a8dc..db839b5b76 100644
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
@@ -132,7 +132,7 @@ public:
 
 REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
 #endif
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index f7a80e23e1..55f52816ab 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "BatchNormalizationLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 #include "CudnnBatchNormLayer.h"
 #endif
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index 412762d384..33cf24431d 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 #include "hl_batch_transpose.h"
 #endif
 #include "BatchNormalizationLayer.h"
@@ -90,7 +90,7 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
   size_t batchSize = in->getHeight();
   CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
@@ -127,7 +127,7 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
   }
   CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 96d5c54acc..43ab4e4d47 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "PoolLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 #include "CudnnPoolLayer.h"
 #endif
 namespace paddle {
@@ -53,7 +53,7 @@ Layer* PoolLayer::create(const LayerConfig& config) {
   const std::string& pool = config.inputs(0).pool_conf().pool_type();
   if (pool == "max-projection" || pool == "avg-projection") {
     return new PoolProjectionLayer(config);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
 #endif
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index a38880e14c..59df057a80 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -674,7 +674,7 @@ void testLayerGradKernel(TestConfig testConf,
                          bool useGpu,
                          bool useWeight,
                          float epsilon) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 659eefa31b..c1c85f8fac 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -119,7 +119,7 @@ TEST(Layer, batchNorm) {
   CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 void batchNormInference(int n, int c, int h, int w) {
   MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
   MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index e7325e0cc3..16556469cb 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -117,7 +117,7 @@ MatrixPtr doOneConvTest(size_t imgSize,
 }
 
 TEST(Layer, convParaUnified) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   MatrixPtr input, resultCpu, resultGpu;
 
   /// TEST1 for conv ///
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
index af43dc51fa..1a83f48fae 100644
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -150,7 +150,7 @@ TEST(Layer, detectionOutputLayerFwd) {
                            useGpu,
                            result2);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   // GPU case 1.
   useGpu = true;
   inputLoc = Matrix::create(1, 16, false, useGpu);
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 93996392d2..42bb570572 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -51,7 +51,7 @@ void testEvaluator(TestConfig testConf,
                    string testEvaluatorName,
                    size_t batchSize,
                    bool useGpu) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
index 308abe6816..1594de8502 100644
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -97,7 +97,7 @@ TEST(Layer, kmaxSeqScoreLayer) {
       Matrix::create(subSeqStartPosition.back(), 1, false, false);
 
   std::vector<bool> mode = {false};
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   mode.push_back(true);
 #endif
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 090bde7b20..e887dee5f9 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 #include <cudnn.h>
 #endif
 #include <gtest/gtest.h>
@@ -258,7 +258,7 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                      true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(Projection, conv) {
   /// test ConvProjection
   testProjectionConv(1, false);
@@ -422,7 +422,7 @@ TEST(Layer, depthwiseConvLayer) {
   //  'depthwise_conv' is a sepecial case of 'exconv' whose
   //  groups size equals to the input channels size.
   testDepthwiseConvLayer("exconv", /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testDepthwiseConvLayer("exconv", /* useGpu= */ true);
 #endif
 }
@@ -480,7 +480,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, convLayer) {
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
   testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -525,7 +525,7 @@ TEST(Layer, convTransLayer) {
   for (auto useGpu : {false, true}) {
     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -638,7 +638,7 @@ TEST(Layer, SelectiveFullyConnectedLayer) {
                 /* trans= */ false,
                 /* useGup= */ false,
                 false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testLayerGrad(config,
                 "selective_fc",
                 100,
@@ -1210,7 +1210,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
   testLayerGrad(config, "pool", 100, trans, useGpu);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
@@ -1236,7 +1236,7 @@ TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
@@ -1309,7 +1309,7 @@ void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, Pool3DLayer) {
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -1695,7 +1695,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, BatchNormalizationLayer) {
   testBatchNormLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testBatchNormLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNormLayer("cudnn_batch_norm", false, true);
@@ -1744,7 +1744,7 @@ void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, testBatchNorm3DLayer) {
   testBatchNorm3DLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testBatchNorm3DLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNorm3DLayer("cudnn_batch_norm", false, true);
@@ -2262,7 +2262,7 @@ void test3DConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DConvLayer) {
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -2339,7 +2339,7 @@ void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DDeConvLayer) {
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index d36f72360f..e322fef9a4 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -243,7 +243,7 @@ TEST(Compare, concat_slice) {
   compareNetwork(config_file_a, config_file_b);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(Compare, img_pool) {
   std::string config_file_a = "./gserver/tests/img_pool_a.conf";
   std::string config_file_b = "./gserver/tests/img_pool_b.conf";
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index ae0e3bc3d2..cbc0fff7b8 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -151,7 +151,7 @@ TEST(Layer, priorBoxLayerFwd) {
                     useGpu,
                     result);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   // reset the input parameters
   variance[1] = 0.1;
   variance[3] = 0.2;
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index e11bf402c2..988dbc2513 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -485,7 +485,7 @@ TEST(ProtoDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
             if (useGpu) {
               continue;
             }
@@ -525,7 +525,7 @@ TEST(ProtoDataProvider, constant_slots) {
       for (int numConstantSlots : {1, 2}) {
         for (int useGpu : numTwoArray) {
           for (int dataCompression : numTwoArray) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
             if (useGpu) {
               continue;
             }
@@ -708,7 +708,7 @@ TEST(ProtoSequenceDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
             if (useGpu) {
               continue;
             }
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index db883543c3..f6522febf8 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -37,7 +37,7 @@ TEST(PyDataProvider, py_fill_slots) {
   config.clear_files();
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   bool useGpu = false;
 #else
   bool useGpu = true;
@@ -71,7 +71,7 @@ TEST(PyDataProvider, py_fill_nest_slots) {
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
   EXPECT_EQ(config.IsInitialized(), true);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   bool useGpu = false;
 #else
   bool useGpu = true;
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index ab23d00a2c..b25d32fb2c 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -321,7 +321,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) {
       "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
 
   for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
     if (useGpu) {
       break;
     }
@@ -388,7 +388,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
                           outMatSelfc->getWidth(),
                           outMatSelfc->getElementCnt()));
   cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -418,7 +418,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
   MatrixPtr cpuOutMatFc(
       new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
   cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -443,7 +443,7 @@ TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
   selLayerConfig.set_size(fcLayerWidth);
 
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
index e1d4ae1617..f28149081b 100644
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -195,7 +195,7 @@ TEST(Layer, SeqSliceLayer) {
   vector<vector<real>> ends;
 
   std::vector<bool> mode = {false};
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   mode.push_back(true);
 #endif
   genSeqInfo(seqStartPos, subSeqStartPos);
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 55427e2f12..ae5b64257f 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -199,7 +199,7 @@ TEST(Layer, WarpCTCLayer) {
     for (auto batchSize : {1, 10, 32}) {
       for (auto normByTimes : {false, true}) {
         for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
           if (useGpu) continue;
 #endif
           LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 0023b4d0f5..de02f9c0d5 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -670,7 +670,7 @@ void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
 }
 
 void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -694,7 +694,7 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
 }
 
 void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -741,7 +741,7 @@ void GpuMatrix::rowMax(Matrix& max) {
 }
 
 void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 6370c77386..1f31082ae8 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -836,7 +836,7 @@ void GpuSparseMatrix::zeroMem() {
 }
 
 void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index eb87ee9bb7..54e57b255d 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -172,7 +172,7 @@ void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
 
 template <class T>
 void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   hl_vector_select_from<T>(this->getData(),
                            this->getSize(),
                            src.getData(),
@@ -850,7 +850,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
                                 size_t size)
     : sync_(nullptr) {
   CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   SyncedFlag* flag = src.getSync();
   if (*flag == DATA_AT_CPU) {
     src.copyToGpu();  // will set synchronous data between CPU and GPU
@@ -861,7 +861,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
   auto cMemHandle = (src.getVector(false))->getMemoryHandle();
   cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
       size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   auto gMemHandle = (src.getVector(true))->getMemoryHandle();
   gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
       size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 1ca70ea84c..cf2f66aea1 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -68,7 +68,7 @@ void testPoolAllocator() {
 
 TEST(Allocator, Pool) {
   testPoolAllocator<CpuAllocator>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testPoolAllocator<GpuAllocator>();
 #endif
 }
@@ -92,7 +92,7 @@ TEST(MemoryHandle, Cpu) {
   EXPECT_EQ(ptr1, ptr2);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(MemoryHandle, Gpu) {
   int numGpu = hl_get_device_count();
 
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 22ce39701f..730759f3db 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 /**
  * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
  * implementation of CPU and GPU member function in
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 58bc43a38b..ccb4a902b0 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 #include <gtest/gtest.h>
 #include "paddle/math/Vector.h"
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 04c856453d..2d439cd060 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -94,7 +94,7 @@ void testWrapper(F&& f) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(ExecViaCpu, test1) {
   testWrapper(f);
   testWrapper(&f);
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index e6b5dba446..6dab187e3e 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index 1c21da5b76..7a145eae6a 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 /**
  * This test file use autotest::AutoCompare and cmpWithArg to compares the
  * implementation of CPU and GPU member function in Matrix.cpp.
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index c0572dfdbf..8151dde106 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -47,7 +47,7 @@ struct MatrixPara {
   SparseFormat format;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 void test_sparse_matrix_mul(MatrixPara paraA,
                             MatrixPara paraB,
                             MatrixPara paraC) {
@@ -452,7 +452,7 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
   matC->trimFrom(*mat);
@@ -546,7 +546,7 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
   matC->trimFrom(*mat);
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 31b693afa8..d03698dee2 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -270,7 +270,7 @@ TEST(Unary, BaseOp) {
   TestUnaryVectorT<CpuIVector, int> testCpuIVector(
       testUnaryBaseOpInt<CpuIVector>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
   TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
   TestUnaryVectorT<GpuIVector, int> testGpuIVector(
@@ -317,7 +317,7 @@ void testUnayrMathOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, MathOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
 #endif
 }
@@ -374,7 +374,7 @@ void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, CompareOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
 #endif
 }
@@ -536,7 +536,7 @@ void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, BaseOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
 #endif
 }
@@ -710,7 +710,7 @@ void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, MathOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
 #endif
 }
@@ -810,7 +810,7 @@ void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, CompareOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
 #endif
 }
@@ -955,7 +955,7 @@ void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, BaseOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
 #endif
 }
@@ -1058,7 +1058,7 @@ void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, CompareOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
 #endif
 }
@@ -1086,7 +1086,7 @@ void testQuaternaryAdd(
 TEST(Quaternary, BaseOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
 #endif
 }
@@ -1156,7 +1156,7 @@ void testQuaternaryCompareOp(
 TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 4a88844b43..36ac024007 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -91,7 +91,7 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
 typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
 
 void testCase(testMatrixFunc matrixFunc) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   for (auto useGpu : {false, true}) {
 #else
   for (auto useGpu : {false}) {
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 4eb9837909..0189e534eb 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   const int nx = 100;
   const int ny = 50;
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 92afab4ff7..04f23cff55 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -72,7 +72,7 @@ void testLazyAssign(int height, int width) {
 
 TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
 #endif
 
@@ -142,6 +142,6 @@ void testSgdUpdate(int height, int width) {
 
 TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
 #endif
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 061fb22e3f..7735877ac8 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 /// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
 /// only cpu version.
 
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index 60ebae0153..dff18136ae 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index a9185a4b24..e39cc0a2f6 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 /// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
 //  so disable when
 /// only cpu version.
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index bb44970109..ed0c3374ff 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -175,7 +175,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
       // Compute the maximum allocation size for the first allocation.
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index a270bd5958..64f8182b5c 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -62,7 +62,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool CPUAllocator::UseGpu() const { return false; }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 82ba322e05..6b1f40347b 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -40,7 +40,7 @@ class CPUAllocator : public SystemAllocator {
   virtual bool UseGpu() const;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 class GPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index ba44e06ddb..57d5443d50 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
   TestAllocator(a, 0);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a;
   TestAllocator(a, 2048);
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index c96a697a7e..184d0f8fa7 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -26,7 +26,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
   std::memcpy(dst, src, num);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 template <>
 void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
                                                   void* dst,
diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h
index 2b9c0eada6..7142831d43 100644
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
@@ -33,7 +33,7 @@ namespace memory {
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 /**
  * \brief   Copy memory from one place to another place.
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 29bc26f9d3..6d5a74dafe 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -62,7 +62,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   using BuddyAllocVec = std::vector<BuddyAllocator*>;
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 53cc63a098..7a617f04dc 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -80,7 +80,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 size_t align(size_t size, paddle::platform::GPUPlace place) {
   size += sizeof(paddle::memory::detail::Metadata);
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h
index b165224b37..9f05a26322 100644
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -34,7 +34,7 @@ struct StridedMemcpyFunctor<T, 1> {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
       memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
     } else {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
       auto& gpu_place = boost::get<platform::GPUPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index f0b8c88591..3d040ca2b5 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -71,7 +71,7 @@ void testIm2col() {
     context =
         new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
   } else {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
     context =
         new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
 #else
@@ -116,7 +116,7 @@ void testIm2col() {
 
 TEST(math, im2col) {
   testIm2col<paddle::platform::CPUPlace>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   testIm2col<paddle::platform::GPUPlace>();
 #endif
 }
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 22468a0c4a..2252268620 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -1,7 +1,7 @@
 #include "paddle/operators/math/math_function.h"
 #include "gtest/gtest.h"
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(math_function, notrans_mul_trans) {
   paddle::framework::Tensor input1;
   paddle::framework::Tensor input1_gpu;
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
index 05882a8873..e0dd7b19f1 100644
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -72,7 +72,7 @@ TEST(StridedMemcpy, CPUConcat) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(StridedMemcpy, GPUCrop) {
   // clang-format off
   int src[] = {
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 36af1ac677..8dcc357a16 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -35,7 +35,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 
 Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 template <>
 Eigen::GpuDevice*
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index d805d2ab08..c1c4c7f760 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/gpu_info.h"
@@ -61,7 +61,7 @@ class CPUDeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 template <>
 struct EigenDeviceConverter<platform::GPUPlace> {
   using EigenDeviceType = Eigen::GpuDevice;
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 52bd23039b..f9fe521d50 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -29,7 +29,7 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
@@ -113,7 +113,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index f0c825bd9b..ac884386dd 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 #include <cuda_runtime.h>
 #include <stddef.h>
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index 16ee00efe7..8145799dfd 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -16,7 +16,7 @@
 
 #include <boost/config.hpp>
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 
 // Because boost's variadic templates has bug on nvcc, boost will disable
 // variadic template support when GPU enabled on nvcc.
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 6f6c9e596c..96724530f5 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -215,7 +215,7 @@ int main(int argc, char** argv) {
 
   uint64_t dataSize = FLAGS_dim * sizeof(real);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   GpuVector gpuParam(FLAGS_dim);
   GpuVector gpuGrad(FLAGS_dim);
 #else
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 04236fda2f..74ab1f2f77 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -99,7 +99,7 @@ TEST(ProtoServer, regular) {
 }
 
 TEST(ProtoServer, extended) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   ProtoClient* client;
   if (FLAGS_rdma_tcp == "rdma")
     client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index d480427f59..761d82fc4d 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -34,7 +34,7 @@ static size_t UniqueIntegerGenerator() {
 }
 
 bool IsCompileGPU() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   return false;
 #else
   return true;
@@ -78,7 +78,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
@@ -96,7 +96,7 @@ PYBIND11_PLUGIN(core) {
       .def(
           "__init__",
           [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
             new (&instance) LoDTensor(lod);
 #else
              LoD new_lod;
@@ -107,7 +107,7 @@ PYBIND11_PLUGIN(core) {
           })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
              self.set_lod(lod);
 #else
              LoD new_lod;
@@ -117,7 +117,7 @@ PYBIND11_PLUGIN(core) {
 #endif
            })
       .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
         return self.lod();
 #else
            auto lod = self.lod();
@@ -203,7 +203,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::GPUPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
                     PADDLE_THROW("GPUPlace is not supported in CPU device.");
 #else
                     return new paddle::platform::CUDADeviceContext(place);
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 3e3e6bc031..62e85fa54f 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -106,7 +106,7 @@ void PyCPUTensorSetFromArray(
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 template <typename T>
 void PyCUDATensorSetFromArray(
     framework::Tensor &self,
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index 91d89b61a3..a37d53bc72 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -29,7 +29,7 @@ int main(int argc, char** argv) {
   initMain(argc, argv);
   initPython(argc, argv);
   string confFile = TrainerConfigHelper::getConfigNameFromPath(FLAGS_model_dir);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   FLAGS_use_gpu = false;
 #endif
   auto config = std::make_shared<TrainerConfigHelper>(confFile);
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index e855a8fe2e..b5d29da45a 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -146,7 +146,7 @@ void compareGradient(comData& comDataCpu, comData& comDataGpu) {
 }
 
 int main(int argc, char** argv) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   exit(0);
 #endif
   paddle::initMain(argc, argv);
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index 813275518e..4da9ce20fb 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
@@ -198,7 +198,7 @@ TEST(compareSparse, NeuralNetwork) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 264bc46ebc..f69e1aafee 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -51,7 +51,7 @@ void checkGradientTest(const string& configFile,
 
 TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
 
 TEST(checkGradient, multiGpu) {
@@ -97,7 +97,7 @@ TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
   checkGradientTest(configFile3, false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   checkGradientTest(configFile3, true, true);
 #endif
 }
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 00ba61377a..4c4d124fa9 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -79,7 +79,7 @@ void trainerOnePassTest(const string& configFile,
 // 1. test trainer (cpu, gpu).
 TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
 
 TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
@@ -94,7 +94,7 @@ TEST(trainerOnePass, parallel) {
 #endif
 
 // 2. test average_window.
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(average_window, gpu) {
   trainerOnePassTest(configFile1, true, false, 4, 0.01);
 }
@@ -266,7 +266,7 @@ TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
   checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
 TEST(checkRemoteUpdater, gpuTrainer) {
   checkRemoteParameterUpdaterTest(configFile1, true, false);
 }
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 1322e77178..74b4fed7ed 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -113,7 +113,7 @@ void testGeneration(const string& configFile,
 #ifndef PADDLE_TYPE_DOUBLE
 
 TEST(RecurrentGradientMachine, test_generation) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   const auto useGpuConfs = {false};
 #else
   const auto useGpuConfs = {true, false};
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index ab1c181c62..32155ded35 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "Flags.h"
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
 DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 22ce2534d3..904d0f5061 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -218,7 +218,7 @@ protected:
  * *d2* is peer device to enable direct access to by the d1 device.
  */
 inline void enablePeerAccess(int d1, int d2) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   if (hl_device_can_access_peer(d1, d2)) {
     SetDevice dev(d1);
     hl_device_enable_peer_access(d2);
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index f53d6420bb..611fda83d9 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -48,7 +48,7 @@ void printVersion(std::ostream& os);
  * @return return true if paddle compiled with GPU
  */
 constexpr bool isWithGpu() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_GPU
   return false;
 #else
   return true;