From 06b42e9ec5a19fcc0bb393066425f95cd231da06 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 6 Sep 2017 14:47:21 +0800 Subject: [PATCH 01/50] Add crop op. --- paddle/operators/crop_op.cc | 81 ++++++++++ paddle/operators/crop_op.cu | 22 +++ paddle/operators/crop_op.h | 138 ++++++++++++++++++ paddle/pybind/pybind.cc | 1 + .../paddle/v2/framework/tests/test_crop_op.py | 35 +++++ 5 files changed, 277 insertions(+) create mode 100644 paddle/operators/crop_op.cc create mode 100644 paddle/operators/crop_op.cu create mode 100644 paddle/operators/crop_op.h create mode 100644 python/paddle/v2/framework/tests/test_crop_op.py diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc new file mode 100644 index 0000000000..75fa42fc18 --- /dev/null +++ b/paddle/operators/crop_op.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/crop_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto Y = ctx.Input("Y"); + if (Y == nullptr) { + auto shape = GetAttr>("shape"); + PADDLE_ENFORCE_EQ( + shape.size(), dim0.size(), + "Shape size should be equal to dimention size of input tensor."); + ctx.Output("Out")->Resize(paddle::framework::make_ddim(shape)); + } else { + ctx.Output("Out")->Resize(Y->dims()); + } + } +}; + +class CropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CropOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of crop op"); + AddInput("Y", "The input used as reference for cropping. "); + AddOutput("Out", "The output of crop op."); + AddComment(R"DOC( +Crop Operator. 
+)DOC"); + AddAttr>("offsets", "The offsets for cropping."); + AddAttr>("shape", "The shape for cropping."); + } +}; + +class CropOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx.Input("X")->dims(); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + + x_grad->Resize(x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); +REGISTER_OP_CPU_KERNEL(crop, + ops::CropKernel); +REGISTER_OP_CPU_KERNEL(crop_grad, + ops::CropGradKernel); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu new file mode 100644 index 0000000000..5afed49465 --- /dev/null +++ b/paddle/operators/crop_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/crop_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(crop, + ops::CropKernel); +REGISTER_OP_GPU_KERNEL(crop_grad, + ops::CropGradKernel); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h new file mode 100644 index 0000000000..40e05869dd --- /dev/null +++ b/paddle/operators/crop_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 CropdleCropdle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +using EigenTensor = framework::EigenTensor; + +using Tensor = framework::Tensor; + +template +void CropFunction(const framework::ExecutionContext& context) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + auto x_dims = x->dims(); + auto out_dims = out->dims(); + + auto offsets = context.op().GetAttr>("offsets"); + PADDLE_ENFORCE_EQ( + x_dims.size(), offsets.size(), + "Offsets size should be equal to dimension size of input tensor."); + + Eigen::array, D> paddings; + for (size_t i = 0; i < D; ++i) { + paddings[i].first = -(offsets[i]); + paddings[i].second = -(x_dims[i] - out_dims[i] - offsets[i]); + } + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + auto place = context.GetEigenDevice(); + out_tensor.device(place) = x_tensor.pad(paddings, 0); +} + +template +class CropKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + int dim = context.Input("X")->dims().size(); + switch (dim) { + case 1: + CropFunction(context); + break; + case 2: + CropFunction(context); + break; + case 3: + CropFunction(context); + break; + case 4: + CropFunction(context); + break; + case 5: + CropFunction(context); + break; + case 6: + CropFunction(context); + break; + default: + LOG(ERROR) << "Only ranks up to 6 supported."; + } + } +}; + +template +void CropGradFunction(const framework::ExecutionContext& context) { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + d_x->mutable_data(context.GetPlace()); + auto d_x_dims = d_x->dims(); + auto d_out_dims = d_out->dims(); + + auto offsets = context.op().GetAttr>("offsets"); + + Eigen::array, D> paddings; + for (int i = 0; i < d_out_dims.size(); ++i) { + paddings[i].first = offsets[i]; + paddings[i].second = d_x_dims[i] - d_out_dims[i] - offsets[i]; + } + + auto d_x_tensor = EigenTensor::From(*d_x); + auto d_out_tensor = EigenTensor::From(*d_out); + auto place = context.GetEigenDevice(); + d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); +} + +template +class CropGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + size_t dim = + context.Input(framework::GradVarName("Out"))->dims().size(); + switch (dim) { + case 1: + CropGradFunction(context); + break; + case 2: + CropGradFunction(context); + break; + case 3: + CropGradFunction(context); + break; + case 4: + CropGradFunction(context); + break; + case 5: + CropGradFunction(context); + break; + case 6: + CropGradFunction(context); + break; + default: + LOG(ERROR) << "Only ranks up to 6 supported."; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 6896422617..e2ea5c92af 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -48,6 +48,7 @@ USE_NO_KERNEL_OP(identity); USE_OP(minus); USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(scatter); +USE_OP(crop); namespace paddle { namespace framework { diff --git a/python/paddle/v2/framework/tests/test_crop_op.py b/python/paddle/v2/framework/tests/test_crop_op.py new file mode 100644 index 0000000000..27d8332acf --- /dev/null +++ b/python/paddle/v2/framework/tests/test_crop_op.py @@ -0,0 +1,35 @@ +import 
unittest +import numpy as np +from paddle.v2.framework.op import Operator +from gradient_checker import GradientChecker +from op_test_util import OpTestMeta + + +class TestCropOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "crop" + self.inputs = {'X': np.random.random((16, 16)).astype("float32"), } + self.attrs = {} + self.attrs['offsets'] = [2, 3] + self.attrs['shape'] = [8, 8] + self.outputs = {'Out': self.inputs['X'][2:10, 3:11]} + + +class TestCropGradOp(GradientChecker): + def setUp(self): + self.op = Operator( + type="crop", X="X", Out="Out", offsets=[2, 3], shape=[8, 8]) + self.inputs = {'X': np.random.random((16, 16)).astype("float32"), } + + def test_normal(self): + self.check_grad( + self.op, self.inputs, set(["X"]), "Out", max_relative_error=0.5) + + def test_cpu_gpu_compare(self): + self.compare_grad(self.op, self.inputs) + + +if __name__ == '__main__': + unittest.main() From f23ab48468a588d3766ed8db4f6bfa4af9fd8ff2 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 6 Sep 2017 17:24:20 +0800 Subject: [PATCH 02/50] Fix attr int_64 error. --- paddle/operators/crop_op.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 75fa42fc18..77ea51ea79 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/operators/crop_op.h" +#include namespace paddle { namespace operators { @@ -32,7 +33,12 @@ class CropOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( shape.size(), dim0.size(), "Shape size should be equal to dimention size of input tensor."); - ctx.Output("Out")->Resize(paddle::framework::make_ddim(shape)); + std::vector tensor_shape(shape.size()); + for (int i = 0; i < shape.size(); ++i) { + tensor_shape[i] = (int64_t)shape[i]; + } + ctx.Output("Out")->Resize( + paddle::framework::make_ddim(tensor_shape)); } else { ctx.Output("Out")->Resize(Y->dims()); } From 3c49e7b1e4b7b9f8f67fa4b12b05cf648808a40c Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 13 Sep 2017 14:17:51 +0800 Subject: [PATCH 03/50] move EigenDeviceConverter to device_context.h --- paddle/framework/operator.cc | 4 ++-- paddle/framework/operator.h | 19 ++----------------- paddle/operators/math/activation.h | 20 ++++++++++++++++++++ paddle/platform/device_context.cc | 7 ++++--- paddle/platform/device_context.h | 19 ++++++++++++++++++- paddle/platform/device_context_test.cc | 2 +- 6 files changed, 47 insertions(+), 24 deletions(-) create mode 100644 paddle/operators/math/activation.h diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index e1e122091f..25c545d3f9 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,14 +22,14 @@ namespace framework { template <> Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_->get_eigen_device(); + return *device_context_->get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> Eigen::GpuDevice& ExecutionContext::GetEigenDevice() const { - return *device_context_->get_eigen_device(); + return *device_context_->get_eigen_device(); } #endif diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 4600b06009..bfa2190557 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -331,21 +331,6 @@ class InferShapeContext { const Scope& scope_; }; -template -struct EigenDeviceConverter; - 
-template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; - -#ifndef PADDLE_ONLY_CPU -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; -}; -#endif - class ExecutionContext : public InferShapeContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, @@ -353,8 +338,8 @@ class ExecutionContext : public InferShapeContext { : InferShapeContext(op, scope), device_context_(device_context) {} template ::EigenDeviceType> + typename DeviceType = typename platform::EigenDeviceConverter< + PlaceType>::EigenDeviceType> DeviceType& GetEigenDevice() const; platform::Place GetPlace() const { return device_context_->GetPlace(); } diff --git a/paddle/operators/math/activation.h b/paddle/operators/math/activation.h new file mode 100644 index 0000000000..b6af478d82 --- /dev/null +++ b/paddle/operators/math/activation.h @@ -0,0 +1,20 @@ +#include "paddle/framework/eigen.h" +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct sigmoid { + void operator()(const platform::DeviceContext& deice_context, + const framework::Tensor& input, framework::Tensor* output) { + auto x = framework::EigenVector::Flatten(*output); + auto y = framework::EigenVector::Flatten(input); + auto* place = device_context.get_eigen_device(); + y.device(*place) = 1. / (1. + (-x).exp()); + } +}; +} +} +} diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index ad212c5b2c..cf5c3eec81 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -16,8 +16,8 @@ namespace paddle { namespace platform { template <> -Eigen::DefaultDevice* DeviceContext::get_eigen_device() - const { +Eigen::DefaultDevice* +DeviceContext::get_eigen_device() const { return reinterpret_cast(this)->eigen_device(); } @@ -91,7 +91,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { }; template <> -Eigen::GpuDevice* DeviceContext::get_eigen_device() const { +Eigen::GpuDevice* DeviceContext::get_eigen_device() + const { return reinterpret_cast(this)->eigen_device(); } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 11528e1194..a46ba4c703 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -27,12 +27,29 @@ limitations under the License. 
*/ namespace paddle { namespace platform { +template +struct EigenDeviceConverter; + +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::DefaultDevice; +}; + +#ifndef PADDLE_ONLY_CPU +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::GpuDevice; +}; +#endif + class DeviceContext { public: virtual ~DeviceContext() {} virtual Place GetPlace() const = 0; - template + template ::EigenDeviceType> DeviceType* get_eigen_device() const; }; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 5883a55272..d71e0aae58 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -24,7 +24,7 @@ TEST(Device, Init) { for (int i = 0; i < count; i++) { DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = - device_context->template get_eigen_device(); + device_context->template get_eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } From d736fc0e00108384853a996aef9d51dbe81f1564 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 13 Sep 2017 17:33:36 +0800 Subject: [PATCH 04/50] add activation macro --- paddle/framework/operator.h | 6 +- paddle/operators/activation_op.cc | 115 ++++++++++++++++++ .../{sigmoid_op.cu => activation_op.cu} | 11 +- paddle/operators/activation_op.h | 71 +++++++++++ paddle/operators/math/activation.h | 20 --- paddle/operators/math/activation_functor.h | 96 +++++++++++++++ paddle/operators/sigmoid_op.cc | 61 ---------- paddle/operators/sigmoid_op.h | 62 ---------- paddle/pybind/pybind.cc | 4 +- .../paddle/v2/framework/tests/test_exp_op.py | 22 ++++ .../paddle/v2/framework/tests/test_relu_op.py | 22 ++++ 11 files changed, 342 insertions(+), 148 deletions(-) create mode 100644 paddle/operators/activation_op.cc rename paddle/operators/{sigmoid_op.cu => activation_op.cu} (66%) create mode 100644 paddle/operators/activation_op.h delete mode 100644 paddle/operators/math/activation.h create mode 100644 paddle/operators/math/activation_functor.h delete mode 100644 paddle/operators/sigmoid_op.cc delete mode 100644 paddle/operators/sigmoid_op.h create mode 100644 python/paddle/v2/framework/tests/test_exp_op.py create mode 100644 python/paddle/v2/framework/tests/test_relu_op.py diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index bfa2190557..0970797e02 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -139,9 +139,9 @@ class OperatorBase { // Macro for define a clone method. // If you are writing an kernel operator, `Clone` will be defined when you // register it. i.e. `Clone` method is not needed to define by yourself. -#define DEFINE_OP_CLONE_METHOD(cls) \ - std::unique_ptr Clone() const final { \ - return std::unique_ptr(new cls(*this)); \ +#define DEFINE_OP_CLONE_METHOD(cls) \ + std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final { \ + return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \ } // Macro for define a default constructor for Operator. diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc new file mode 100644 index 0000000000..d2c2378fef --- /dev/null +++ b/paddle/operators/activation_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/activation_op.h" + +#define FILL_ACTIVATION_OP \ + public: \ + using framework::OperatorWithKernel::OperatorWithKernel; \ + \ + protected: \ + void InferShape(const framework::InferShapeContext &ctx) const override { \ + ctx.Output("Y")->Resize( \ + ctx.Input("X")->dims()); \ + } + +#define FILL_ACTIVATION_GRAD_OP \ + public: \ + using framework::OperatorWithKernel::OperatorWithKernel; \ + \ + protected: \ + void InferShape(const framework::InferShapeContext &ctx) const override { \ + ctx.Output(framework::GradVarName("X")) \ + ->Resize(ctx.Input("Y")->dims()); \ + } + +namespace paddle { +namespace operators { + +class SigmoidOp : public framework::OperatorWithKernel { + FILL_ACTIVATION_OP +}; + +class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sigmoid operator"); + AddOutput("Y", "Output of Sigmoid operator"); + AddComment("Sigmoid activation operator"); + } +}; + +class SigmoidOpGrad : public framework::OperatorWithKernel { + FILL_ACTIVATION_GRAD_OP +}; + +class ExpOp : public framework::OperatorWithKernel { + FILL_ACTIVATION_OP +}; + +class ExpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Exp operator"); + AddOutput("Y", "Output of Exp operator"); + AddComment("Exp activation operator"); + } +}; + +class ExpOpGrad : public framework::OperatorWithKernel { + FILL_ACTIVATION_GRAD_OP +}; + +class ReluOp : public framework::OperatorWithKernel { + FILL_ACTIVATION_OP +}; + +class ReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Relu operator"); + AddOutput("Y", "Output of Relu operator"); + AddComment("Relu activation operator"); + } +}; + +class ReluOpGrad : public framework::OperatorWithKernel { + FILL_ACTIVATION_GRAD_OP +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, sigmoid_grad, + ops::SigmoidOpGrad); +REGISTER_OP_CPU_KERNEL(sigmoid, + ops::SigmoidKernel); +REGISTER_OP_CPU_KERNEL( + sigmoid_grad, ops::SigmoidGradKernel); + +REGISTER_OP(exp, ops::ExpOp, ops::ExpOpMaker, exp_grad, ops::ExpOpGrad); +REGISTER_OP_CPU_KERNEL(exp, ops::ExpKernel); +REGISTER_OP_CPU_KERNEL(exp_grad, + ops::ExpGradKernel); + +REGISTER_OP(relu, ops::ReluOp, ops::ReluOpMaker, relu_grad, ops::ReluOpGrad); +REGISTER_OP_CPU_KERNEL(relu, + ops::ReluKernel); +REGISTER_OP_CPU_KERNEL(relu_grad, + ops::ReluGradKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/activation_op.cu similarity index 66% rename from paddle/operators/sigmoid_op.cu rename to paddle/operators/activation_op.cu index 1a50dfe14a..55d9f52124 100644 --- a/paddle/operators/sigmoid_op.cu 
+++ b/paddle/operators/activation_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #define EIGEN_USE_GPU -#include "paddle/operators/sigmoid_op.h" +#include "paddle/operators/activation_op.h" namespace ops = paddle::operators; @@ -21,3 +21,12 @@ REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel); REGISTER_OP_GPU_KERNEL( sigmoid_grad, ops::SigmoidGradKernel); + +REGISTER_OP_GPU_KERNEL(exp, ops::ExpKernel); +REGISTER_OP_GPU_KERNEL(exp_grad, + ops::ExpGradKernel); + +REGISTER_OP_GPU_KERNEL(relu, + ops::ReluKernel); +REGISTER_OP_GPU_KERNEL(relu_grad, + ops::ReluGradKernel); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h new file mode 100644 index 0000000000..9e4101805e --- /dev/null +++ b/paddle/operators/activation_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/activation_functor.h" + +#define ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Kernel + +#define DEFINE_ACTIVATION_KERNEL(ACTIVATION_NAME) \ + template \ + class ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) : public framework::OpKernel { \ + public: \ + void Compute(const framework::ExecutionContext& context) const override { \ + auto* X = context.Input("X"); \ + auto* Y = context.Output("Y"); \ + Y->mutable_data(context.GetPlace()); \ + math::ACTIVATION_NAME functor; \ + auto* device_context = context.device_context(); \ + functor(*device_context, *X, Y); \ + } \ + }; + +#define DEFINE_ACTIVATION_GRAD_KERNEL(ACTIVATION_GRAD_NAME) \ + template \ + class ACTIVATION_KERNEL_NAME(ACTIVATION_GRAD_NAME) \ + : public framework::OpKernel { \ + public: \ + void Compute(const framework::ExecutionContext& context) const override { \ + auto* X = context.Input("X"); \ + auto* Y = context.Input("Y"); \ + auto* dY = \ + context.Input(framework::GradVarName("Y")); \ + auto* dX = \ + context.Output(framework::GradVarName("X")); \ + dX->mutable_data(context.GetPlace()); \ + math::ACTIVATION_GRAD_NAME functor; \ + auto* device_context = context.device_context(); \ + functor(*device_context, *X, *Y, *dY, dX); \ + } \ + }; + +namespace paddle { +namespace operators { + +DEFINE_ACTIVATION_KERNEL(Sigmoid); + +DEFINE_ACTIVATION_GRAD_KERNEL(SigmoidGrad); + +DEFINE_ACTIVATION_KERNEL(Exp); + +DEFINE_ACTIVATION_GRAD_KERNEL(ExpGrad); + +DEFINE_ACTIVATION_KERNEL(Relu); + +DEFINE_ACTIVATION_GRAD_KERNEL(ReluGrad); + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/activation.h b/paddle/operators/math/activation.h deleted file mode 100644 index b6af478d82..0000000000 --- a/paddle/operators/math/activation.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "paddle/framework/eigen.h" -#include "paddle/framework/tensor.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct sigmoid { - void operator()(const platform::DeviceContext& deice_context, - const 
framework::Tensor& input, framework::Tensor* output) { - auto x = framework::EigenVector::Flatten(*output); - auto y = framework::EigenVector::Flatten(input); - auto* place = device_context.get_eigen_device(); - y.device(*place) = 1. / (1. + (-x).exp()); - } -}; -} -} -} diff --git a/paddle/operators/math/activation_functor.h b/paddle/operators/math/activation_functor.h new file mode 100644 index 0000000000..7e15607f46 --- /dev/null +++ b/paddle/operators/math/activation_functor.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct Sigmoid { + void operator()(const platform::DeviceContext& device_context, + const framework::Tensor& X, framework::Tensor* Y) { + auto x = framework::EigenVector::Flatten(X); + auto y = framework::EigenVector::Flatten(*Y); + auto* place = device_context.template get_eigen_device(); + y.device(*place) = 1. / (1. + (-x).exp()); + } +}; + +template +struct SigmoidGrad { + void operator()(const platform::DeviceContext& device_context, + const framework::Tensor& X, const framework::Tensor& Y, + const framework::Tensor& dY, framework::Tensor* dX) { + auto dx = framework::EigenVector::Flatten(*dX); + auto y = framework::EigenVector::Flatten(Y); + auto dy = framework::EigenVector::Flatten(dY); + auto* place = device_context.template get_eigen_device(); + dx.device(*place) = dy * y * (1. 
- y); + } +}; + +template +struct Exp { + void operator()(const platform::DeviceContext& device_context, + const framework::Tensor& input, framework::Tensor* output) { + auto x = framework::EigenVector::Flatten(input); + auto y = framework::EigenVector::Flatten(*output); + auto* place = device_context.template get_eigen_device(); + y.device(*place) = x.exp(); + } +}; + +template +struct ExpGrad { + void operator()(const platform::DeviceContext& device_context, + const framework::Tensor& X, const framework::Tensor& Y, + const framework::Tensor& dY, framework::Tensor* dX) { + auto dx = framework::EigenVector::Flatten(*dX); + auto dy = framework::EigenVector::Flatten(dY); + auto* place = device_context.template get_eigen_device(); + dx.device(*place) = dy.exp(); + } +}; + +template +struct Relu { + void operator()(const platform::DeviceContext& device_context, + const framework::Tensor& input, framework::Tensor* output) { + auto x = framework::EigenVector::Flatten(input); + auto y = framework::EigenVector::Flatten(*output); + auto* place = device_context.template get_eigen_device(); + y.device(*place) = x.cwiseMax(static_cast(0)); + } +}; + +template +struct ReluGrad { + void operator()(const platform::DeviceContext& device_context, + const framework::Tensor& X, const framework::Tensor& Y, + const framework::Tensor& dY, framework::Tensor* dX) { + auto dx = framework::EigenVector::Flatten(*dX); + auto dy = framework::EigenVector::Flatten(dY); + auto x = framework::EigenVector::Flatten(X); + auto* place = device_context.template get_eigen_device(); + dx.device(*place) = dy * (x > static_cast(0)).template cast(); + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc deleted file mode 100644 index 761c6de8d4..0000000000 --- a/paddle/operators/sigmoid_op.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/operators/sigmoid_op.h" - -namespace paddle { -namespace operators { - -class SigmoidOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - ctx.Output("Y")->Resize(ctx.Input("X")->dims()); - } -}; - -class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { - public: - SigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "sigmoid input"); - AddOutput("Y", "sigmoid output"); - AddComment("Sigmoid function"); - } -}; - -class SigmoidOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - ctx.Output(framework::GradVarName("X")) - ->Resize(ctx.Input("Y")->dims()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, sigmoid_grad, - ops::SigmoidOpGrad); -REGISTER_OP_CPU_KERNEL(sigmoid, - ops::SigmoidKernel); -REGISTER_OP_CPU_KERNEL( - sigmoid_grad, ops::SigmoidGradKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h deleted file mode 100644 index b01a9b3f23..0000000000 --- a/paddle/operators/sigmoid_op.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include "paddle/framework/eigen.h" -#include "paddle/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -class SigmoidKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto input = context.Input("X"); - auto output = context.Output("Y"); - output->mutable_data(context.GetPlace()); - - // The clipping is used in Paddle's raw implenmention - auto X = EigenVector::Flatten(*input); - auto Y = EigenVector::Flatten(*output); - auto place = context.GetEigenDevice(); - - Y.device(place) = 1. / (1. + (-X).exp()); - } -}; - -template -class SigmoidGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto Y_t = context.Input("Y"); - auto dY_t = context.Input(framework::GradVarName("Y")); - auto dX_t = context.Output(framework::GradVarName("X")); - - dX_t->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*dX_t); - auto Y = EigenVector::Flatten(*Y_t); - auto dY = EigenVector::Flatten(*dY_t); - dX.device(context.GetEigenDevice()) = dY * Y * (1. 
- Y); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 16a2368aae..bd964c5d07 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -36,7 +36,6 @@ USE_OP(onehot_cross_entropy); USE_OP(sgd); USE_OP(mul); USE_OP(mean); -USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); USE_OP(fill_zeros_like); @@ -55,6 +54,9 @@ USE_OP(top_k); USE_OP(squared_l2_distance); USE_OP(sum); USE_OP(reshape); +USE_OP(sigmoid); +USE_OP(exp); +USE_OP(relu); namespace paddle { namespace framework { diff --git a/python/paddle/v2/framework/tests/test_exp_op.py b/python/paddle/v2/framework/tests/test_exp_op.py new file mode 100644 index 0000000000..5a004f6fe2 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_exp_op.py @@ -0,0 +1,22 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestExp(OpTest): + def setUp(self): + self.op_type = "exp" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.exp(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", max_relative_error=0.007) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_relu_op.py b/python/paddle/v2/framework/tests/test_relu_op.py new file mode 100644 index 0000000000..07b7113d79 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_relu_op.py @@ -0,0 +1,22 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestExp(OpTest): + def setUp(self): + self.op_type = "exp" + self.inputs = { + 'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", max_relative_error=0.007) + + +if __name__ == '__main__': + unittest.main() From b50a50761760d124aa4a38c81599a1069bc6fbf0 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 13 Sep 2017 17:45:11 +0800 Subject: [PATCH 05/50] add activation operator python test --- paddle/operators/math/activation_functor.h | 4 ++-- python/paddle/v2/framework/tests/test_relu_op.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/operators/math/activation_functor.h b/paddle/operators/math/activation_functor.h index 7e15607f46..1e9bdd142e 100644 --- a/paddle/operators/math/activation_functor.h +++ b/paddle/operators/math/activation_functor.h @@ -61,9 +61,9 @@ struct ExpGrad { const framework::Tensor& X, const framework::Tensor& Y, const framework::Tensor& dY, framework::Tensor* dX) { auto dx = framework::EigenVector::Flatten(*dX); - auto dy = framework::EigenVector::Flatten(dY); + auto y = framework::EigenVector::Flatten(Y); auto* place = device_context.template get_eigen_device(); - dx.device(*place) = dy.exp(); + dx.device(*place) = y; } }; diff --git a/python/paddle/v2/framework/tests/test_relu_op.py b/python/paddle/v2/framework/tests/test_relu_op.py index 07b7113d79..58a0872db4 100644 --- a/python/paddle/v2/framework/tests/test_relu_op.py +++ b/python/paddle/v2/framework/tests/test_relu_op.py @@ -3,9 +3,9 @@ import numpy as np from op_test import OpTest -class TestExp(OpTest): +class TestRelu(OpTest): def setUp(self): - self.op_type = "exp" + self.op_type = "relu" self.inputs = { 'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") } From 4e173527c1650ed86df714392e53801a498b0078 Mon Sep 17 00:00:00 
2001 From: qijun Date: Wed, 13 Sep 2017 17:57:41 +0800 Subject: [PATCH 06/50] fix op python tests --- python/paddle/v2/framework/tests/test_exp_op.py | 4 ++-- python/paddle/v2/framework/tests/test_relu_op.py | 8 +++----- python/paddle/v2/framework/tests/test_sigmoid_op.py | 4 ++-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_exp_op.py b/python/paddle/v2/framework/tests/test_exp_op.py index 5a004f6fe2..0ec41e56a0 100644 --- a/python/paddle/v2/framework/tests/test_exp_op.py +++ b/python/paddle/v2/framework/tests/test_exp_op.py @@ -15,8 +15,8 @@ class TestExp(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(["X"], "Y", max_relative_error=0.007) + self.check_grad(['X'], 'Y', max_relative_error=0.007) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_relu_op.py b/python/paddle/v2/framework/tests/test_relu_op.py index 58a0872db4..c9af0c2ba7 100644 --- a/python/paddle/v2/framework/tests/test_relu_op.py +++ b/python/paddle/v2/framework/tests/test_relu_op.py @@ -6,17 +6,15 @@ from op_test import OpTest class TestRelu(OpTest): def setUp(self): self.op_type = "relu" - self.inputs = { - 'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") - } + self.inputs = {'X': np.random.uniform(-1, 1, [4, 4]).astype("float32")} self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(["X"], "Y", max_relative_error=0.007) + self.check_grad(['X'], 'Y', max_relative_error=0.007) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py index 2316e49eff..cf05e934d5 100644 --- a/python/paddle/v2/framework/tests/test_sigmoid_op.py +++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py @@ -15,8 +15,8 @@ class TestSigmoid(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(["X"], "Y", max_relative_error=0.007) + self.check_grad(['X'], 'Y', max_relative_error=0.007) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From c18ebc3022961f404265a80400fcc29d216b4534 Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 14 Sep 2017 07:10:43 +0800 Subject: [PATCH 07/50] remove macros --- paddle/operators/activation_op.cc | 134 ++++++++++++++---------- paddle/operators/activation_op.h | 162 ++++++++++++++++++++++-------- paddle/pybind/pybind.cc | 2 +- 3 files changed, 203 insertions(+), 95 deletions(-) diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index d2c2378fef..e713b5a211 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -14,33 +14,55 @@ #include "paddle/operators/activation_op.h" -#define FILL_ACTIVATION_OP \ - public: \ - using framework::OperatorWithKernel::OperatorWithKernel; \ - \ - protected: \ - void InferShape(const framework::InferShapeContext &ctx) const override { \ - ctx.Output("Y")->Resize( \ - ctx.Input("X")->dims()); \ - } - -#define FILL_ACTIVATION_GRAD_OP \ - public: \ - using framework::OperatorWithKernel::OperatorWithKernel; \ - \ - protected: \ - void InferShape(const framework::InferShapeContext &ctx) const override { \ - ctx.Output(framework::GradVarName("X")) \ - ->Resize(ctx.Input("Y")->dims()); \ - } +// #define FILL_ACTIVATION_OP \ +// public: \ +// using framework::OperatorWithKernel::OperatorWithKernel; \ +// 
\ +// protected: \ +// void InferShape(const framework::InferShapeContext &ctx) const override { \ +// ctx.Output("Y")->Resize( \ +// ctx.Input("X")->dims()); \ +// } + +// #define FILL_ACTIVATION_GRAD_OP \ +// public: \ +// using framework::OperatorWithKernel::OperatorWithKernel; \ +// \ +// protected: \ +// void InferShape(const framework::InferShapeContext &ctx) const override { \ +// ctx.Output(framework::GradVarName("X")) \ +// ->Resize(ctx.Input("Y")->dims()); \ +// } namespace paddle { namespace operators { -class SigmoidOp : public framework::OperatorWithKernel { - FILL_ACTIVATION_OP +class ActivationOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + ctx.Output("Y")->Resize( + ctx.Input("X")->dims()); + } }; +class ActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + ctx.Output(framework::GradVarName("X")) + ->Resize(ctx.Input("Y")->dims()); + } +}; + +// class SigmoidOp : public framework::OperatorWithKernel { +// FILL_ACTIVATION_OP +// }; + class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: SigmoidOpMaker(framework::OpProto *proto, @@ -52,13 +74,13 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class SigmoidOpGrad : public framework::OperatorWithKernel { - FILL_ACTIVATION_GRAD_OP -}; +// class SigmoidOpGrad : public framework::OperatorWithKernel { +// FILL_ACTIVATION_GRAD_OP +// }; -class ExpOp : public framework::OperatorWithKernel { - FILL_ACTIVATION_OP -}; +// class ExpOp : public framework::OperatorWithKernel { +// FILL_ACTIVATION_OP +// }; class ExpOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -70,13 +92,13 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class ExpOpGrad : public framework::OperatorWithKernel { - FILL_ACTIVATION_GRAD_OP -}; +// class ExpOpGrad : public framework::OperatorWithKernel { +// FILL_ACTIVATION_GRAD_OP +// }; -class ReluOp : public framework::OperatorWithKernel { - FILL_ACTIVATION_OP -}; +// class ReluOp : public framework::OperatorWithKernel { +// FILL_ACTIVATION_OP +// }; class ReluOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -88,28 +110,36 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class ReluOpGrad : public framework::OperatorWithKernel { - FILL_ACTIVATION_GRAD_OP -}; +// class ReluOpGrad : public framework::OperatorWithKernel { +// FILL_ACTIVATION_GRAD_OP +// }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, sigmoid_grad, - ops::SigmoidOpGrad); -REGISTER_OP_CPU_KERNEL(sigmoid, - ops::SigmoidKernel); +REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL( + sigmoid, + ops::ActivationKernel); +REGISTER_OP_CPU_KERNEL(sigmoid_grad, + ops::ActivationGradKernel); + +REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL( + exp, ops::ActivationKernel); REGISTER_OP_CPU_KERNEL( - sigmoid_grad, ops::SigmoidGradKernel); - -REGISTER_OP(exp, ops::ExpOp, ops::ExpOpMaker, exp_grad, ops::ExpOpGrad); -REGISTER_OP_CPU_KERNEL(exp, ops::ExpKernel); -REGISTER_OP_CPU_KERNEL(exp_grad, - 
ops::ExpGradKernel); - -REGISTER_OP(relu, ops::ReluOp, ops::ReluOpMaker, relu_grad, ops::ReluOpGrad); -REGISTER_OP_CPU_KERNEL(relu, - ops::ReluKernel); -REGISTER_OP_CPU_KERNEL(relu_grad, - ops::ReluGradKernel); + exp_grad, + ops::ActivationGradKernel); + +// REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, +// ops::ActivationOpGrad); +// REGISTER_OP_CPU_KERNEL(relu, +// ops::ReluKernel); +// REGISTER_OP_CPU_KERNEL(relu_grad, +// ops::ReluGradKernel); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 9e4101805e..7d5c5bb26f 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -15,57 +15,135 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/math/activation_functor.h" - -#define ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Kernel - -#define DEFINE_ACTIVATION_KERNEL(ACTIVATION_NAME) \ - template \ - class ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) : public framework::OpKernel { \ - public: \ - void Compute(const framework::ExecutionContext& context) const override { \ - auto* X = context.Input("X"); \ - auto* Y = context.Output("Y"); \ - Y->mutable_data(context.GetPlace()); \ - math::ACTIVATION_NAME functor; \ - auto* device_context = context.device_context(); \ - functor(*device_context, *X, Y); \ - } \ - }; - -#define DEFINE_ACTIVATION_GRAD_KERNEL(ACTIVATION_GRAD_NAME) \ - template \ - class ACTIVATION_KERNEL_NAME(ACTIVATION_GRAD_NAME) \ - : public framework::OpKernel { \ - public: \ - void Compute(const framework::ExecutionContext& context) const override { \ - auto* X = context.Input("X"); \ - auto* Y = context.Input("Y"); \ - auto* dY = \ - context.Input(framework::GradVarName("Y")); \ - auto* dX = \ - context.Output(framework::GradVarName("X")); \ - dX->mutable_data(context.GetPlace()); \ - math::ACTIVATION_GRAD_NAME functor; \ - auto* device_context = context.device_context(); \ - functor(*device_context, *X, *Y, *dY, dX); \ - } \ - }; +// #include "paddle/operators/math/activation_functor.h" + +// #define ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Kernel + +// #define DEFINE_ACTIVATION_KERNEL(ACTIVATION_NAME) \ +// template \ +// class ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) : public framework::OpKernel { \ +// public: \ +// void Compute(const framework::ExecutionContext& context) const override { \ +// auto* X = context.Input("X"); \ +// auto* Y = context.Output("Y"); \ +// Y->mutable_data(context.GetPlace()); \ +// math::ACTIVATION_NAME functor; \ +// auto* device_context = context.device_context(); \ +// functor(*device_context, *X, Y); \ +// } \ +// }; + +// #define DEFINE_ACTIVATION_GRAD_KERNEL(ACTIVATION_GRAD_NAME) \ +// template \ +// class ACTIVATION_KERNEL_NAME(ACTIVATION_GRAD_NAME) \ +// : public framework::OpKernel { \ +// public: \ +// void Compute(const framework::ExecutionContext& context) const override { \ +// auto* X = context.Input("X"); \ +// auto* Y = context.Input("Y"); \ +// auto* dY = \ +// context.Input(framework::GradVarName("Y")); \ +// auto* dX = \ +// context.Output(framework::GradVarName("X")); \ +// dX->mutable_data(context.GetPlace()); \ +// math::ACTIVATION_GRAD_NAME functor; \ +// auto* device_context = context.device_context(); \ +// functor(*device_context, *X, *Y, *dY, dX); \ +// } \ +// }; namespace paddle { namespace operators { -DEFINE_ACTIVATION_KERNEL(Sigmoid); +template +class ActivationKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Output("Y"); + Y->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto y = framework::EigenVector::Flatten(*Y); + auto place = context.GetEigenDevice(); + Functor functor; + functor(place, x, y); + } +}; + +template +class ActivationGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* dY = context.Input(framework::GradVarName("Y")); + auto* dX = context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto dy = framework::EigenVector::Flatten(*dY); + auto x = framework::EigenVector::Flatten(*X); + auto y = framework::EigenVector::Flatten(*Y); + auto dx = framework::EigenVector::Flatten(*dX); + auto place = context.GetEigenDevice(); + Functor functor; + functor(place, x, y, dy, dx); + } +}; + +struct Sigmoid { + template + void operator()(Device d, X x, Y y) { + y.device(d) = 1. / (1. + (-x).exp()); + } +}; + +struct SigmoidGrad { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = dy * y * (1. - y); + } +}; + +struct Exp { + template + void operator()(Device d, X x, Y y) { + y.device(d) = x.exp(); + } +}; + +struct ExpGrad { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = y; + } +}; + +// template +// struct Relu { +// void operator()(Device d, X x, Y y) { +// y.device(d) = x.cwiseMax(static_cast(0)); +// } +// }; + +// template +// struct ReluGrad { +// void operator()(Device d, X x, Y y, dY dy, dX dx) { +// dx.device(d) = dy * (x > static_cast(0)).template cast(); +// } +// }; + +// DEFINE_ACTIVATION_KERNEL(Sigmoid); -DEFINE_ACTIVATION_GRAD_KERNEL(SigmoidGrad); +// DEFINE_ACTIVATION_GRAD_KERNEL(SigmoidGrad); -DEFINE_ACTIVATION_KERNEL(Exp); +// DEFINE_ACTIVATION_KERNEL(Exp); -DEFINE_ACTIVATION_GRAD_KERNEL(ExpGrad); +// DEFINE_ACTIVATION_GRAD_KERNEL(ExpGrad); -DEFINE_ACTIVATION_KERNEL(Relu); +// DEFINE_ACTIVATION_KERNEL(Relu); -DEFINE_ACTIVATION_GRAD_KERNEL(ReluGrad); +// DEFINE_ACTIVATION_GRAD_KERNEL(ReluGrad); } // namespace operators } // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index bd964c5d07..bed35d7822 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -56,7 +56,7 @@ USE_OP(sum); USE_OP(reshape); USE_OP(sigmoid); USE_OP(exp); -USE_OP(relu); +// USE_OP(relu); namespace paddle { namespace framework { From 0957fa7b3c8b8929aa3a8fd94e33a75af3c314dc Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 14 Sep 2017 07:33:07 +0800 Subject: [PATCH 08/50] fix relu functor and revert some codes --- paddle/framework/operator.cc | 4 +- paddle/framework/operator.h | 25 ++++-- paddle/operators/activation_op.cc | 79 ++++-------------- paddle/operators/activation_op.cu | 22 +++-- paddle/operators/activation_op.h | 82 ++++-------------- paddle/operators/math/activation_functor.h | 96 ---------------------- paddle/platform/device_context.cc | 7 +- paddle/platform/device_context.h | 19 +---- paddle/platform/device_context_test.cc | 2 +- paddle/pybind/pybind.cc | 2 +- 10 files changed, 78 insertions(+), 260 deletions(-) delete mode 100644 paddle/operators/math/activation_functor.h diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 25c545d3f9..e1e122091f 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,14 
+22,14 @@ namespace framework { template <> Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_->get_eigen_device(); + return *device_context_->get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> Eigen::GpuDevice& ExecutionContext::GetEigenDevice() const { - return *device_context_->get_eigen_device(); + return *device_context_->get_eigen_device(); } #endif diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 0970797e02..4600b06009 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -139,9 +139,9 @@ class OperatorBase { // Macro for define a clone method. // If you are writing an kernel operator, `Clone` will be defined when you // register it. i.e. `Clone` method is not needed to define by yourself. -#define DEFINE_OP_CLONE_METHOD(cls) \ - std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final { \ - return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \ +#define DEFINE_OP_CLONE_METHOD(cls) \ + std::unique_ptr Clone() const final { \ + return std::unique_ptr(new cls(*this)); \ } // Macro for define a default constructor for Operator. @@ -331,6 +331,21 @@ class InferShapeContext { const Scope& scope_; }; +template +struct EigenDeviceConverter; + +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::DefaultDevice; +}; + +#ifndef PADDLE_ONLY_CPU +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::GpuDevice; +}; +#endif + class ExecutionContext : public InferShapeContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, @@ -338,8 +353,8 @@ class ExecutionContext : public InferShapeContext { : InferShapeContext(op, scope), device_context_(device_context) {} template ::EigenDeviceType> + typename DeviceType = + typename EigenDeviceConverter::EigenDeviceType> DeviceType& GetEigenDevice() const; platform::Place GetPlace() const { return device_context_->GetPlace(); } diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index e713b5a211..ffa5c26da3 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -14,26 +14,6 @@ #include "paddle/operators/activation_op.h" -// #define FILL_ACTIVATION_OP \ -// public: \ -// using framework::OperatorWithKernel::OperatorWithKernel; \ -// \ -// protected: \ -// void InferShape(const framework::InferShapeContext &ctx) const override { \ -// ctx.Output("Y")->Resize( \ -// ctx.Input("X")->dims()); \ -// } - -// #define FILL_ACTIVATION_GRAD_OP \ -// public: \ -// using framework::OperatorWithKernel::OperatorWithKernel; \ -// \ -// protected: \ -// void InferShape(const framework::InferShapeContext &ctx) const override { \ -// ctx.Output(framework::GradVarName("X")) \ -// ->Resize(ctx.Input("Y")->dims()); \ -// } - namespace paddle { namespace operators { @@ -59,10 +39,6 @@ class ActivationOpGrad : public framework::OperatorWithKernel { } }; -// class SigmoidOp : public framework::OperatorWithKernel { -// FILL_ACTIVATION_OP -// }; - class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: SigmoidOpMaker(framework::OpProto *proto, @@ -74,14 +50,6 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { } }; -// class SigmoidOpGrad : public framework::OperatorWithKernel { -// FILL_ACTIVATION_GRAD_OP -// }; - -// class ExpOp : public framework::OperatorWithKernel { -// FILL_ACTIVATION_OP -// }; - class ExpOpMaker : public 
framework::OpProtoAndCheckerMaker { public: ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) @@ -92,14 +60,6 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker { } }; -// class ExpOpGrad : public framework::OperatorWithKernel { -// FILL_ACTIVATION_GRAD_OP -// }; - -// class ReluOp : public framework::OperatorWithKernel { -// FILL_ACTIVATION_OP -// }; - class ReluOpMaker : public framework::OpProtoAndCheckerMaker { public: ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) @@ -110,36 +70,33 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { } }; -// class ReluOpGrad : public framework::OperatorWithKernel { -// FILL_ACTIVATION_GRAD_OP -// }; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(sigmoid, + ops::ActivationKernel); REGISTER_OP_CPU_KERNEL( - sigmoid, - ops::ActivationKernel); -REGISTER_OP_CPU_KERNEL(sigmoid_grad, - ops::ActivationGradKernel); + sigmoid_grad, ops::ActivationGradKernel); REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, ops::ActivationOpGrad); REGISTER_OP_CPU_KERNEL( - exp, ops::ActivationKernel); + exp, + ops::ActivationKernel); +REGISTER_OP_CPU_KERNEL(exp_grad, + ops::ActivationGradKernel); + +REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(relu, + ops::ActivationKernel>); REGISTER_OP_CPU_KERNEL( - exp_grad, - ops::ActivationGradKernel); - -// REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, -// ops::ActivationOpGrad); -// REGISTER_OP_CPU_KERNEL(relu, -// ops::ReluKernel); -// REGISTER_OP_CPU_KERNEL(relu_grad, -// ops::ReluGradKernel); + relu_grad, ops::ActivationGradKernel>); diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 55d9f52124..3b2c147f46 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -18,15 +18,21 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(sigmoid, - ops::SigmoidKernel); + ops::ActivationKernel); REGISTER_OP_GPU_KERNEL( - sigmoid_grad, ops::SigmoidGradKernel); + sigmoid_grad, ops::ActivationGradKernel); -REGISTER_OP_GPU_KERNEL(exp, ops::ExpKernel); +REGISTER_OP_GPU_KERNEL( + exp, + ops::ActivationKernel); REGISTER_OP_GPU_KERNEL(exp_grad, - ops::ExpGradKernel); - + ops::ActivationGradKernel); REGISTER_OP_GPU_KERNEL(relu, - ops::ReluKernel); -REGISTER_OP_GPU_KERNEL(relu_grad, - ops::ReluGradKernel); + ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + relu_grad, ops::ActivationGradKernel>); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 7d5c5bb26f..0b7e171e72 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -15,42 +15,6 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -// #include "paddle/operators/math/activation_functor.h" - -// #define ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Kernel - -// #define DEFINE_ACTIVATION_KERNEL(ACTIVATION_NAME) \ -// template \ -// class ACTIVATION_KERNEL_NAME(ACTIVATION_NAME) : public framework::OpKernel { \ -// public: \ -// void Compute(const framework::ExecutionContext& context) const override { \ -// auto* X = context.Input("X"); \ -// auto* Y = context.Output("Y"); \ -// Y->mutable_data(context.GetPlace()); \ -// math::ACTIVATION_NAME functor; \ -// auto* 
device_context = context.device_context(); \ -// functor(*device_context, *X, Y); \ -// } \ -// }; - -// #define DEFINE_ACTIVATION_GRAD_KERNEL(ACTIVATION_GRAD_NAME) \ -// template \ -// class ACTIVATION_KERNEL_NAME(ACTIVATION_GRAD_NAME) \ -// : public framework::OpKernel { \ -// public: \ -// void Compute(const framework::ExecutionContext& context) const override { \ -// auto* X = context.Input("X"); \ -// auto* Y = context.Input("Y"); \ -// auto* dY = \ -// context.Input(framework::GradVarName("Y")); \ -// auto* dX = \ -// context.Output(framework::GradVarName("X")); \ -// dX->mutable_data(context.GetPlace()); \ -// math::ACTIVATION_GRAD_NAME functor; \ -// auto* device_context = context.device_context(); \ -// functor(*device_context, *X, *Y, *dY, dX); \ -// } \ -// }; namespace paddle { namespace operators { @@ -91,59 +55,49 @@ class ActivationGradKernel : public framework::OpKernel { } }; -struct Sigmoid { +struct SigmoidFunctor { template void operator()(Device d, X x, Y y) { y.device(d) = 1. / (1. + (-x).exp()); } }; -struct SigmoidGrad { +struct SigmoidGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { dx.device(d) = dy * y * (1. - y); } }; -struct Exp { +struct ExpFunctor { template void operator()(Device d, X x, Y y) { y.device(d) = x.exp(); } }; -struct ExpGrad { +struct ExpGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { dx.device(d) = y; } }; -// template -// struct Relu { -// void operator()(Device d, X x, Y y) { -// y.device(d) = x.cwiseMax(static_cast(0)); -// } -// }; - -// template -// struct ReluGrad { -// void operator()(Device d, X x, Y y, dY dy, dX dx) { -// dx.device(d) = dy * (x > static_cast(0)).template cast(); -// } -// }; - -// DEFINE_ACTIVATION_KERNEL(Sigmoid); - -// DEFINE_ACTIVATION_GRAD_KERNEL(SigmoidGrad); - -// DEFINE_ACTIVATION_KERNEL(Exp); - -// DEFINE_ACTIVATION_GRAD_KERNEL(ExpGrad); - -// DEFINE_ACTIVATION_KERNEL(Relu); +template +struct ReluFunctor { + template + void operator()(Device d, X x, Y y) { + y.device(d) = x.cwiseMax(static_cast(0)); + } +}; -// DEFINE_ACTIVATION_GRAD_KERNEL(ReluGrad); +template +struct ReluGradFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = dy * (x > static_cast(0)).template cast(); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/activation_functor.h b/paddle/operators/math/activation_functor.h deleted file mode 100644 index 1e9bdd142e..0000000000 --- a/paddle/operators/math/activation_functor.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#pragma once -#include "paddle/framework/eigen.h" -#include "paddle/framework/tensor.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct Sigmoid { - void operator()(const platform::DeviceContext& device_context, - const framework::Tensor& X, framework::Tensor* Y) { - auto x = framework::EigenVector::Flatten(X); - auto y = framework::EigenVector::Flatten(*Y); - auto* place = device_context.template get_eigen_device(); - y.device(*place) = 1. / (1. + (-x).exp()); - } -}; - -template -struct SigmoidGrad { - void operator()(const platform::DeviceContext& device_context, - const framework::Tensor& X, const framework::Tensor& Y, - const framework::Tensor& dY, framework::Tensor* dX) { - auto dx = framework::EigenVector::Flatten(*dX); - auto y = framework::EigenVector::Flatten(Y); - auto dy = framework::EigenVector::Flatten(dY); - auto* place = device_context.template get_eigen_device(); - dx.device(*place) = dy * y * (1. - y); - } -}; - -template -struct Exp { - void operator()(const platform::DeviceContext& device_context, - const framework::Tensor& input, framework::Tensor* output) { - auto x = framework::EigenVector::Flatten(input); - auto y = framework::EigenVector::Flatten(*output); - auto* place = device_context.template get_eigen_device(); - y.device(*place) = x.exp(); - } -}; - -template -struct ExpGrad { - void operator()(const platform::DeviceContext& device_context, - const framework::Tensor& X, const framework::Tensor& Y, - const framework::Tensor& dY, framework::Tensor* dX) { - auto dx = framework::EigenVector::Flatten(*dX); - auto y = framework::EigenVector::Flatten(Y); - auto* place = device_context.template get_eigen_device(); - dx.device(*place) = y; - } -}; - -template -struct Relu { - void operator()(const platform::DeviceContext& device_context, - const framework::Tensor& input, framework::Tensor* output) { - auto x = framework::EigenVector::Flatten(input); - auto y = framework::EigenVector::Flatten(*output); - auto* place = device_context.template get_eigen_device(); - y.device(*place) = x.cwiseMax(static_cast(0)); - } -}; - -template -struct ReluGrad { - void operator()(const platform::DeviceContext& device_context, - const framework::Tensor& X, const framework::Tensor& Y, - const framework::Tensor& dY, framework::Tensor* dX) { - auto dx = framework::EigenVector::Flatten(*dX); - auto dy = framework::EigenVector::Flatten(dY); - auto x = framework::EigenVector::Flatten(X); - auto* place = device_context.template get_eigen_device(); - dx.device(*place) = dy * (x > static_cast(0)).template cast(); - } -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index cf5c3eec81..ad212c5b2c 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -16,8 +16,8 @@ namespace paddle { namespace platform { template <> -Eigen::DefaultDevice* -DeviceContext::get_eigen_device() const { +Eigen::DefaultDevice* DeviceContext::get_eigen_device() + const { return reinterpret_cast(this)->eigen_device(); } @@ -91,8 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { }; template <> -Eigen::GpuDevice* DeviceContext::get_eigen_device() - const { +Eigen::GpuDevice* DeviceContext::get_eigen_device() const { return reinterpret_cast(this)->eigen_device(); } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index a46ba4c703..11528e1194 100644 --- a/paddle/platform/device_context.h +++ 
b/paddle/platform/device_context.h @@ -27,29 +27,12 @@ limitations under the License. */ namespace paddle { namespace platform { -template -struct EigenDeviceConverter; - -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; - -#ifndef PADDLE_ONLY_CPU -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; -}; -#endif - class DeviceContext { public: virtual ~DeviceContext() {} virtual Place GetPlace() const = 0; - template ::EigenDeviceType> + template DeviceType* get_eigen_device() const; }; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index d71e0aae58..5883a55272 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -24,7 +24,7 @@ TEST(Device, Init) { for (int i = 0; i < count; i++) { DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = - device_context->template get_eigen_device(); + device_context->template get_eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index bed35d7822..bd964c5d07 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -56,7 +56,7 @@ USE_OP(sum); USE_OP(reshape); USE_OP(sigmoid); USE_OP(exp); -// USE_OP(relu); +USE_OP(relu); namespace paddle { namespace framework { From 3110bf9a9aaa8cbc3f52bd171b12340b299481db Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 14 Sep 2017 07:37:31 +0800 Subject: [PATCH 09/50] merge activation operator python tests --- .../v2/framework/tests/test_activation_op.py | 50 +++++++++++++++++++ .../paddle/v2/framework/tests/test_exp_op.py | 22 -------- .../paddle/v2/framework/tests/test_relu_op.py | 20 -------- .../v2/framework/tests/test_sigmoid_op.py | 22 -------- 4 files changed, 50 insertions(+), 64 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_activation_op.py delete mode 100644 python/paddle/v2/framework/tests/test_exp_op.py delete mode 100644 python/paddle/v2/framework/tests/test_relu_op.py delete mode 100644 python/paddle/v2/framework/tests/test_sigmoid_op.py diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py new file mode 100644 index 0000000000..23ff584396 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -0,0 +1,50 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestExp(OpTest): + def setUp(self): + self.op_type = "exp" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.exp(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestRelu(OpTest): + def setUp(self): + self.op_type = "relu" + self.inputs = {'X': np.random.uniform(-1, 1, [4, 4]).astype("float32")} + self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestSigmoid(OpTest): + def setUp(self): + self.op_type = "sigmoid" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', 
max_relative_error=0.007) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_exp_op.py b/python/paddle/v2/framework/tests/test_exp_op.py deleted file mode 100644 index 0ec41e56a0..0000000000 --- a/python/paddle/v2/framework/tests/test_exp_op.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -class TestExp(OpTest): - def setUp(self): - self.op_type = "exp" - self.inputs = { - 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") - } - self.outputs = {'Y': np.exp(self.inputs['X'])} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_relu_op.py b/python/paddle/v2/framework/tests/test_relu_op.py deleted file mode 100644 index c9af0c2ba7..0000000000 --- a/python/paddle/v2/framework/tests/test_relu_op.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -class TestRelu(OpTest): - def setUp(self): - self.op_type = "relu" - self.inputs = {'X': np.random.uniform(-1, 1, [4, 4]).astype("float32")} - self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py deleted file mode 100644 index cf05e934d5..0000000000 --- a/python/paddle/v2/framework/tests/test_sigmoid_op.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -class TestSigmoid(OpTest): - def setUp(self): - self.op_type = "sigmoid" - self.inputs = { - 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") - } - self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) - - -if __name__ == "__main__": - unittest.main() From e515f18dd857d2f9f986955cd76208a965eb5c5c Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 14 Sep 2017 10:26:41 +0800 Subject: [PATCH 10/50] add tanh and sqrt activation operators --- paddle/operators/activation_op.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 0b7e171e72..4421c10957 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -99,5 +99,36 @@ struct ReluGradFunctor { } }; +struct TanhFunctor { + template + void operator()(Device d, X x, Y y) { + y.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = dy * (T(1) - y * y); + } +}; + +struct SqrtFunctor { + template + void operator()(Device d, X x, Y y) { + y.device(d) = x.sqrt(); + } +}; + +template +struct SqrtGradFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + const T y_conj = Eigen::numext::conj(y); + dx.device(d) = static_cast(0.5) * dy / y_conj; + } +}; + } // namespace operators } // namespace paddle From 96500af64b07913b8cd3be09dceb8fe02db86168 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 14 Sep 2017 11:12:37 +0800 Subject: [PATCH 11/50] add rank_loss 
operator --- paddle/operators/rank_loss_op.cc | 103 +++++++++++++++++++++++++++++++ paddle/operators/rank_loss_op.cu | 22 +++++++ paddle/operators/rank_loss_op.h | 90 +++++++++++++++++++++++++++ paddle/pybind/pybind.cc | 1 + 4 files changed, 216 insertions(+) create mode 100644 paddle/operators/rank_loss_op.cc create mode 100644 paddle/operators/rank_loss_op.cu create mode 100644 paddle/operators/rank_loss_op.h diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc new file mode 100644 index 0000000000..14cddb609f --- /dev/null +++ b/paddle/operators/rank_loss_op.cc @@ -0,0 +1,103 @@ + +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/rank_loss_op.h" + +namespace paddle { +namespace operators { + +class RankLossOp : public framework::OperatorWithKernel { + public: + RankLossOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + // input check + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("P"), "Input(P) shouldn't be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oi"), "Input(Oi) shouldn't be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oj"), "Input(Oj) shouldn't be null"); + auto p_dims = ctx.Input("P")->dims(); + auto oi_dims = ctx.Input("Oi")->dims(); + auto oj_dims = ctx.Input("Oj")->dims(); + PADDLE_ENFORCE_EQ(oi_dims, oj_dims, + "Input(Oi) and Input(Oj) must have the same size"); + PADDLE_ENFORCE_EQ( + p_dims, oi_dims, + "Input(P) must have the same size with Input(Oi) & Input(Oj)"); + ctx.Output("Out")->Resize(p_dims); + } +}; + +class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RankLossOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("P", "The first input of RankLoss operator."); + AddInput("Oi", "The second input of RankLoss operator."); + AddInput("Oj", "The third input of RankLoss operator."); + AddOutput("Out", "The output tensor of RankLoss operator."); + AddComment(R"DOC(RankLoss operator + +A rank loss operator for learning to rank (LTR) task. This operator contains +three inputs: P, Oi, and Oj, and the rank cost can be expressed as + +\f[ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ + o_{i,j} = o_i - o_j \\ + \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} +\f] + +[1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to + Rank useing Gradient Descent. 
+)DOC"); + } +}; + +class RankLossGradOp : public framework::OperatorWithKernel { + public: + RankLossGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("P"), "Input(P) shouldn't be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oi"), "Input(Oi) shouldn't be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oj"), "Input(Oj) shouldn't be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto dims = ctx.Input("P")->dims(); + ctx.Output(framework::GradVarName("P"))->Resize(dims); + ctx.Output(framework::GradVarName("Oi"))->Resize(dims); + ctx.Output(framework::GradVarName("Oj"))->Resize(dims); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, + ops::RankLossGradOp); +REGISTER_OP_CPU_KERNEL(rank_loss, + ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, ops::RankLossGradKernel); diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu new file mode 100644 index 0000000000..779588ff36 --- /dev/null +++ b/paddle/operators/rank_loss_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/rank_loss_op.h" + +REGISTER_OP_GPU_KERNEL( + rank_loss, + paddle::operators::RankLossKernel); +REGISTER_OP_GPU_KERNEL( + rank_loss_grad, + paddle::operators::RankLossGradKernel); diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h new file mode 100644 index 0000000000..d21871107a --- /dev/null +++ b/paddle/operators/rank_loss_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RankLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out = ctx.Output("Out"); + auto* p_t = ctx.Input("P"); + auto* oi_t = ctx.Input("Oi"); + auto* oj_t = ctx.Input("Oj"); + out->mutable_data(ctx.GetPlace()); + + auto& dev = ctx.GetEigenDevice(); + auto out_eig = framework::EigenVector::Flatten(*out); + auto p_eig = framework::EigenVector::Flatten(*p_t); + auto oi_eig = framework::EigenVector::Flatten(*oi_t); + auto oj_eig = framework::EigenVector::Flatten(*oj_t); + + framework::Tensor o_t; + o_t.Resize(oi_t->dims()); + o_t.mutable_data(ctx.GetPlace()); + auto o_eig = framework::EigenVector::Flatten(o_t); + o_eig.device(dev) = oi_eig - oj_eig; + + out_eig.device(dev) = (1. + (o_eig).exp()).log() - p_eig * o_eig; + } +}; + +template +class RankLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_oi = ctx.Output(framework::GradVarName("Oi")); + auto* d_oj = ctx.Output(framework::GradVarName("Oj")); + auto* d_p = ctx.Output(framework::GradVarName("P")); + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* p_t = ctx.Input("P"); + auto* oi_t = ctx.Input("Oi"); + auto* oj_t = ctx.Input("Oj"); + + d_oi->mutable_data(ctx.GetPlace()); + d_oj->mutable_data(ctx.GetPlace()); + d_p->mutable_data(ctx.GetPlace()); + + auto& dev = ctx.GetEigenDevice(); + auto d_out_eig = framework::EigenVector::Flatten(*d_out); + auto p_eig = framework::EigenVector::Flatten(*p_t); + auto oi_eig = framework::EigenVector::Flatten(*oi_t); + auto oj_eig = framework::EigenVector::Flatten(*oj_t); + + auto d_oi_eig = framework::EigenVector::Flatten(*d_oi); + auto d_oj_eig = framework::EigenVector::Flatten(*d_oj); + + framework::Tensor o_t; + o_t.Resize(oi_t->dims()); + o_t.mutable_data(ctx.GetPlace()); + auto o_eig = framework::EigenVector::Flatten(o_t); + o_eig.device(dev) = oi_eig - oj_eig; + + // dOi & dOj + d_oi_eig.device(dev) = + d_out_eig * (o_eig.exp() / (1. 
+ o_eig.exp()) - p_eig); + d_oj_eig.device(dev) = -d_oi_eig; + // dP + framework::EigenVector::Flatten(*d_p).device(dev) = -o_eig; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index ef62d6e997..1805a830b3 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -56,6 +56,7 @@ USE_OP(top_k); USE_OP(squared_l2_distance); USE_OP(sum); USE_OP(reshape); +USE_OP(rank_loss); namespace paddle { namespace framework { From 7c423e4b0db7657e526ad05b0dd0e20e6582acf0 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 14 Sep 2017 11:17:04 +0800 Subject: [PATCH 12/50] add unit test for rank_loss_op --- .../v2/framework/tests/test_rank_loss_op.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 python/paddle/v2/framework/tests/test_rank_loss_op.py diff --git a/python/paddle/v2/framework/tests/test_rank_loss_op.py b/python/paddle/v2/framework/tests/test_rank_loss_op.py new file mode 100644 index 0000000000..48354b7f7b --- /dev/null +++ b/python/paddle/v2/framework/tests/test_rank_loss_op.py @@ -0,0 +1,27 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestReshapeOp(OpTest): + def setUp(self): + self.op_type = "rank_loss" + num = 5 + # P = {0, 1.0} or {0, 0.5, 1.0} + P = np.random.randint(0, 2, size=(num, num)).astype("float32") + Oi = np.random.random((num, num)).astype("float32") + Oj = np.random.random((num, num)).astype("float32") + O = Oi - Oj + Out = np.log(1.0 + np.exp(O)) - P * O + self.inputs = {'P': P, 'Oi': Oi, 'Oj': Oj} + self.outputs = {'Out': Out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Oj"], "Out") + + +if __name__ == '__main__': + unittest.main() From 87de31bf205a1ffb63c74f7f8b338bcce38dcb2c Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 14 Sep 2017 12:09:16 +0800 Subject: [PATCH 13/50] update doc information --- paddle/operators/rank_loss_op.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 14cddb609f..66571bd9a6 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -48,9 +48,9 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { RankLossOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("P", "The first input of RankLoss operator."); - AddInput("Oi", "The second input of RankLoss operator."); - AddInput("Oj", "The third input of RankLoss operator."); + AddInput("P", "The desired target values for posteriors."); + AddInput("Oi", "The model output for item i."); + AddInput("Oj", "The model output for item j."); AddOutput("Out", "The output tensor of RankLoss operator."); AddComment(R"DOC(RankLoss operator @@ -63,6 +63,8 @@ three inputs: P, Oi, and Oj, and the rank cost can be expressed as \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} \f] +A detailed explanation about these notations can be found in + [1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to Rank useing Gradient Descent. 
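An informal NumPy sketch of the forward computation (illustrative only; it
mirrors the Python unit test added for this operator and is not part of the
operator's interface):

  import numpy as np

  def rank_loss(P, Oi, Oj):
      O = Oi - Oj                              # o_{i,j} = o_i - o_j
      return np.log(1.0 + np.exp(O)) - P * O   # -P * o + log(1 + e^o)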
)DOC"); From dadace3178ab1f038bec7d8fcdfb849e8fc6963f Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 14 Sep 2017 14:02:29 +0800 Subject: [PATCH 14/50] add more activation functors --- paddle/operators/activation_op.h | 62 +++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 4421c10957..9bf340f2ed 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -55,6 +55,8 @@ class ActivationGradKernel : public framework::OpKernel { } }; +// sigmoid = 1 / (1 + exp(-x) +template struct SigmoidFunctor { template void operator()(Device d, X x, Y y) { @@ -69,6 +71,7 @@ struct SigmoidGradFunctor { } }; +// exp(x) = e^x struct ExpFunctor { template void operator()(Device d, X x, Y y) { @@ -79,10 +82,11 @@ struct ExpFunctor { struct ExpGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { - dx.device(d) = y; + dx.device(d) = dy * y; } }; +// relu(x) = max(x, 0) template struct ReluFunctor { template @@ -99,6 +103,7 @@ struct ReluGradFunctor { } }; +// tanh = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) struct TanhFunctor { template void operator()(Device d, X x, Y y) { @@ -114,6 +119,7 @@ struct TanhGradFunctor { } }; +// sqrt(x) = x^(1/2) struct SqrtFunctor { template void operator()(Device d, X x, Y y) { @@ -130,5 +136,59 @@ struct SqrtGradFunctor { } }; +// abs(x) = |x| +struct AbsFunctor { + template + void operator()(Device d, X x, Y y) { + y.device(d) = x.abs(); + } +}; + +// reciprocal(x) = 1 / x +template +struct ReciprocalFunctor { + template + void operator()(Device d, X x, Y y) { + y.device(d) = 1. / x; + } +}; + +struct ReciprocalGradFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = dy * (-1.0) * y * y; + } +}; + +// log(x) = natural logarithm of x +struct LogFunctor { + template + void operator()(Device d, X x, Y y) { + y.device(d) = x.log(); + } +}; + +struct LogGradFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = dy * (1. 
/ x); + } +}; + +// square(x) = x^2 +struct SquareFunctor { + template + void operator()(Device d, X x, Y y) { + y.device(d) = x.square(); + } +} + +struct SquareGradFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = dy * 2 * x; + } +}; + } // namespace operators } // namespace paddle From 5824d850012e0c802e90f2ad7d23f4b8e3fc00d2 Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 14 Sep 2017 18:19:13 +0800 Subject: [PATCH 15/50] add activation operators and python unittests --- paddle/operators/activation_op.cc | 214 +++++++++++++++++- paddle/operators/activation_op.cu | 82 +++++++ paddle/operators/activation_op.h | 181 ++++++++++++++- paddle/pybind/pybind.cc | 2 - python/paddle/v2/framework/tests/op_test.py | 2 +- .../v2/framework/tests/test_activation_op.py | 165 +++++++++++++- 6 files changed, 626 insertions(+), 20 deletions(-) diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index ffa5c26da3..8ada158ff3 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -46,7 +46,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Sigmoid operator"); AddOutput("Y", "Output of Sigmoid operator"); - AddComment("Sigmoid activation operator"); + AddComment("Sigmoid activation operator, sigmoid = 1 / (1 + exp(-x))"); } }; @@ -56,7 +56,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Exp operator"); AddOutput("Y", "Output of Exp operator"); - AddComment("Exp activation operator"); + AddComment("Exp activation operator, exp(x) = e^x"); } }; @@ -66,7 +66,129 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Relu operator"); AddOutput("Y", "Output of Relu operator"); - AddComment("Relu activation operator"); + AddComment("Relu activation operator, relu(x) = max(x, 0)"); + } +}; + +class TanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Tanh operator"); + AddOutput("Y", "Output of Tanh operator"); + AddComment( + "Tanh activation operator, tanh = (exp(x) - exp(-x)) / (exp(x) + " + "exp(-x))"); + } +}; + +class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sqrt operator"); + AddOutput("Y", "Output of Sqrt operator"); + AddComment("Sqrt activation operator, sqrt(x) = x^(1/2)"); + } +}; + +class AbsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AbsOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Abs operator"); + AddOutput("Y", "Output of Abs operator"); + AddComment("Abs activation operator, abs(x) = |x|"); + } +}; + +class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReciprocalOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Reciprocal operator"); + AddOutput("Y", "Output of Reciprocal operator"); + AddComment("Reciprocal activation operator, reciprocal(x) = 1 / x"); 
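    // Note (informal): like the sigmoid and tanh gradients, the reciprocal
    // gradient functor is expressed through the output y = 1 / x rather than
    // the input: d(1/x)/dx = -1/x^2 = -y * y, hence dx = dy * (-1) * y * y.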
+ } +}; + +class LogOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Log operator"); + AddOutput("Y", "Output of Log operator"); + AddComment("Log activation operator, log(x) = natural logarithm of x"); + } +}; + +class SquareOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquareOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Square operator"); + AddOutput("Y", "Output of Square operator"); + AddComment("Square activation operator, square(x) = x^2"); + } +}; + +template +class BReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of BRelu operator"); + AddOutput("Y", "Output of BRelu operator"); + AddComment("BRelu activation operator, brelu = max(min(x, t_min), t_max)"); + AddAttr("t_min", "The min marginal value of BRelu") + .SetDefault(static_cast(0)); + AddAttr("t_max", "The max marginal value of BRelu") + .SetDefault(static_cast(24)); + } +}; + +template +class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftReluOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of SoftRelu operator"); + AddOutput("Y", "Output of SoftRelu operator"); + AddComment( + "SoftRelu activation operator, soft_relu = log(1 + exp(max(min(x, " + "threshold), threshold)))"); + AddAttr("threshold", "The threshold value of SoftRelu") + .SetDefault(static_cast(40)); + } +}; + +template +class PowOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Pow operator"); + AddOutput("Y", "Output of Pow operator"); + AddComment("Pow activation operator, pow(x, factor) = x^factor"); + AddAttr("factor", "The exponential factor of Pow") + .SetDefault(static_cast(1)); + } +}; + +template +class STanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of STanh operator"); + AddOutput("Y", "Output of STanh operator"); + AddComment("STanh activation operator, stanh = b * tanh(a * x)"); + AddAttr("scale_a", "The scale parameter of a for the input") + .SetDefault(static_cast(2 / 3)); + AddAttr("scale_b", "The scale parameter of b for the input") + .SetDefault(static_cast(1.7159)); } }; @@ -78,10 +200,10 @@ REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, ops::ActivationOpGrad); REGISTER_OP_CPU_KERNEL(sigmoid, ops::ActivationKernel); + ops::SigmoidFunctor>); REGISTER_OP_CPU_KERNEL( sigmoid_grad, ops::ActivationGradKernel); + ops::SigmoidGradFunctor>); REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, ops::ActivationOpGrad); @@ -100,3 +222,85 @@ REGISTER_OP_CPU_KERNEL(relu, REGISTER_OP_CPU_KERNEL( relu_grad, ops::ActivationGradKernel>); + +REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL( + tanh, + ops::ActivationKernel); +REGISTER_OP_CPU_KERNEL( + 
tanh_grad, ops::ActivationGradKernel>); + +REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL( + sqrt, + ops::ActivationKernel); +REGISTER_OP_CPU_KERNEL( + sqrt_grad, ops::ActivationGradKernel>); + +REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL( + abs, + ops::ActivationKernel); +REGISTER_OP_CPU_KERNEL(abs_grad, + ops::ActivationGradKernel); + +REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, + reciprocal_grad, ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(reciprocal, + ops::ActivationKernel>); +REGISTER_OP_CPU_KERNEL( + reciprocal_grad, + ops::ActivationGradKernel>); + +REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL( + log, + ops::ActivationKernel); +REGISTER_OP_CPU_KERNEL( + log_grad, ops::ActivationGradKernel>); + +REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(square, + ops::ActivationKernel); +REGISTER_OP_CPU_KERNEL( + square_grad, ops::ActivationGradKernel>); + +REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(brelu, + ops::BReluKernel); +REGISTER_OP_CPU_KERNEL(brelu_grad, + ops::BReluGradKernel); + +REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, + soft_relu_grad, ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(soft_relu, + ops::SoftReluKernel); +REGISTER_OP_CPU_KERNEL( + soft_relu_grad, ops::SoftReluGradKernel); + +REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(pow, ops::PowKernel); +REGISTER_OP_CPU_KERNEL(pow_grad, + ops::PowGradKernel); + +REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, + ops::ActivationOpGrad); +REGISTER_OP_CPU_KERNEL(stanh, + ops::STanhKernel); +REGISTER_OP_CPU_KERNEL(stanh_grad, + ops::STanhGradKernel); diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 3b2c147f46..112b33d225 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -36,3 +36,85 @@ REGISTER_OP_GPU_KERNEL(relu, REGISTER_OP_GPU_KERNEL( relu_grad, ops::ActivationGradKernel>); + +REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, + ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(tanh, + ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + tanh_grad, ops::ActivationGradKernel>); + +REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, + ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(sqrt, + ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + sqrt_grad, ops::ActivationGradKernel>); + +REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, + ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(abs, + ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + abs_grad, ops::ActivationGradKernel>); + +REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, + reciprocal_grad, ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(reciprocal, + ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + reciprocal_grad, + ops::ActivationGradKernel>); + +REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, + ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(log, + ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + log_grad, ops::ActivationGradKernel>); + +REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, + ops::ActivationOpGrad); 
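// Note (informal): the element-wise activations without attributes (sigmoid,
// exp, relu, tanh, sqrt, abs, reciprocal, log, square) all reuse the generic
// ActivationKernel / ActivationGradKernel templates and differ only in the
// functor they are instantiated with, while brelu, soft_relu, pow and stanh
// use dedicated kernels because they also read attributes such as
// t_min/t_max, threshold, factor and scale_a/scale_b.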
+REGISTER_OP_GPU_KERNEL(square, + ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + square_grad, ops::ActivationGradKernel>); + +REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, + ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(brelu, + ops::BReluKernel); +REGISTER_OP_GPU_KERNEL(brelu_grad, + ops::BReluGradKernel); + +REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, + soft_relu_grad, ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(soft_relu, + ops::SoftReluKernel); +REGISTER_OP_GPU_KERNEL( + soft_relu_grad, ops::SoftReluGradKernel); + +REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, + ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(pow, ops::PowKernel); +REGISTER_OP_GPU_KERNEL(pow_grad, + ops::PowGradKernel); + +REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, + ops::ActivationOpGrad); +REGISTER_OP_GPU_KERNEL(stanh, + ops::STanhKernel); +REGISTER_OP_GPU_KERNEL(stanh_grad, + ops::STanhGradKernel); \ No newline at end of file diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 9bf340f2ed..15f8afb4ba 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -55,19 +55,20 @@ class ActivationGradKernel : public framework::OpKernel { } }; -// sigmoid = 1 / (1 + exp(-x) +// sigmoid(x) = 1 / (1 + exp(-x)) template struct SigmoidFunctor { template void operator()(Device d, X x, Y y) { - y.device(d) = 1. / (1. + (-x).exp()); + y.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); } }; +template struct SigmoidGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { - dx.device(d) = dy * y * (1. - y); + dx.device(d) = dy * y * (static_cast(1) - y); } }; @@ -103,7 +104,7 @@ struct ReluGradFunctor { } }; -// tanh = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) struct TanhFunctor { template void operator()(Device d, X x, Y y) { @@ -115,7 +116,7 @@ template struct TanhGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { - dx.device(d) = dy * (T(1) - y * y); + dx.device(d) = dy * (static_cast(1) - y * y); } }; @@ -131,7 +132,7 @@ template struct SqrtGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { - const T y_conj = Eigen::numext::conj(y); + const Y y_conj = Eigen::numext::conj(y); dx.device(d) = static_cast(0.5) * dy / y_conj; } }; @@ -144,19 +145,27 @@ struct AbsFunctor { } }; +struct AbsGradFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + dx.device(d) = dy * x.sign(); + } +}; + // reciprocal(x) = 1 / x template struct ReciprocalFunctor { template void operator()(Device d, X x, Y y) { - y.device(d) = 1. / x; + y.device(d) = static_cast(1) / x; } }; +template struct ReciprocalGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { - dx.device(d) = dy * (-1.0) * y * y; + dx.device(d) = dy * static_cast(-1) * y * y; } }; @@ -168,10 +177,11 @@ struct LogFunctor { } }; +template struct LogGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { - dx.device(d) = dy * (1. 
/ x); + dx.device(d) = dy * (static_cast(1) / x); } }; @@ -181,12 +191,161 @@ struct SquareFunctor { void operator()(Device d, X x, Y y) { y.device(d) = x.square(); } -} +}; +template struct SquareGradFunctor { template void operator()(Device d, X x, Y y, dY dy, dX dx) { - dx.device(d) = dy * 2 * x; + dx.device(d) = dy * static_cast(2) * x; + } +}; + +template +class BReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Output("Y"); + auto t_min = static_cast(context.Attr("t_min")); + auto t_max = static_cast(context.Attr("t_max")); + Y->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto y = framework::EigenVector::Flatten(*Y); + auto place = context.GetEigenDevice(); + y.device(place) = x.cwiseMax(t_min).cwiseMin(t_max); + } +}; + +template +class BReluGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* dY = context.Input(framework::GradVarName("Y")); + auto* dX = context.Output(framework::GradVarName("X")); + auto t_min = static_cast(context.Attr("t_min")); + auto t_max = static_cast(context.Attr("t_max")); + dX->mutable_data(context.GetPlace()); + + auto dy = framework::EigenVector::Flatten(*dY); + auto x = framework::EigenVector::Flatten(*X); + auto dx = framework::EigenVector::Flatten(*dX); + auto place = context.GetEigenDevice(); + + dx.device(place) = dy * ((x > t_min) * (x < t_max)).template cast(); + } +}; + +template +class SoftReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Output("Y"); + auto threshold = static_cast(context.Attr("threshold")); + Y->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto y = framework::EigenVector::Flatten(*Y); + auto place = context.GetEigenDevice(); + auto temp = x.cwiseMax(-threshold).cwiseMin(threshold).eval(); + y.device(place) = (static_cast(1) + temp.exp()).log(); + } +}; + +template +class SoftReluGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* dY = context.Input(framework::GradVarName("Y")); + auto* dX = context.Output(framework::GradVarName("X")); + auto threshold = static_cast(context.Attr("threshold")); + dX->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto y = framework::EigenVector::Flatten(*Y); + auto dy = framework::EigenVector::Flatten(*dY); + auto dx = framework::EigenVector::Flatten(*dX); + auto place = context.GetEigenDevice(); + auto temp = ((x > -threshold) * (x < threshold)).template cast().eval(); + dx.device(place) = dy * (static_cast(1) - (-y).exp()) * temp; + } +}; + +template +class PowKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Output("Y"); + auto factor = static_cast(context.Attr("factor")); + Y->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto y = framework::EigenVector::Flatten(*Y); + auto place = context.GetEigenDevice(); + y.device(place) = x.pow(factor); + } +}; + +template +class PowGradKernel : public 
framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* dY = context.Input(framework::GradVarName("Y")); + auto* dX = context.Output(framework::GradVarName("X")); + auto factor = static_cast(context.Attr("factor")); + dX->mutable_data(context.GetPlace()); + + auto dy = framework::EigenVector::Flatten(*dY); + auto x = framework::EigenVector::Flatten(*X); + auto dx = framework::EigenVector::Flatten(*dX); + auto place = context.GetEigenDevice(); + + dx.device(place) = dy * factor * x.pow(factor - static_cast(1)); + } +}; + +template +class STanhKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Output("Y"); + auto scale_a = static_cast(context.Attr("scale_a")); + auto scale_b = static_cast(context.Attr("scale_b")); + Y->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto y = framework::EigenVector::Flatten(*Y); + auto place = context.GetEigenDevice(); + y.device(place) = scale_b * (scale_a * x).tanh(); + } +}; + +template +class STanhGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* dY = context.Input(framework::GradVarName("Y")); + auto* dX = context.Output(framework::GradVarName("X")); + auto scale_a = static_cast(context.Attr("scale_a")); + auto scale_b = static_cast(context.Attr("scale_b")); + dX->mutable_data(context.GetPlace()); + + auto dy = framework::EigenVector::Flatten(*dY); + auto x = framework::EigenVector::Flatten(*X); + auto dx = framework::EigenVector::Flatten(*dX); + auto place = context.GetEigenDevice(); + + auto temp = (scale_a * x).tanh() * (scale_a * x).tanh(); + dx.device(place) = dy * scale_a * scale_b * (static_cast(1) - temp); } }; diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index bd964c5d07..28195b1b0a 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -55,8 +55,6 @@ USE_OP(squared_l2_distance); USE_OP(sum); USE_OP(reshape); USE_OP(sigmoid); -USE_OP(exp); -USE_OP(relu); namespace paddle { namespace framework { diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 4fec4c9109..899d3ae991 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -196,7 +196,7 @@ class OpTest(unittest.TestCase): self.assertTrue( np.allclose( actual, expect, atol=1e-05), - "output name: " + out_name + "has diff") + "output name: " + out_name + " has diff") def check_output(self): places = [core.CPUPlace()] diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py index 23ff584396..7cd39dfe91 100644 --- a/python/paddle/v2/framework/tests/test_activation_op.py +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -21,7 +21,9 @@ class TestExp(OpTest): class TestRelu(OpTest): def setUp(self): self.op_type = "relu" - self.inputs = {'X': np.random.uniform(-1, 1, [4, 4]).astype("float32")} + x = np.random.uniform(-1, 1, [11, 17]).astype("float32") + x = np.sign(x) * np.exp(np.abs(x)) + self.inputs = {'X': x} self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} def test_check_output(self): @@ -42,6 +44,167 @@ class TestSigmoid(OpTest): def test_check_output(self): self.check_output() + def 
test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.008) + + +class TestTanh(OpTest): + def setUp(self): + self.op_type = "tanh" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.tanh(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestSqrt(OpTest): + def setUp(self): + self.op_type = "sqrt" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.sqrt(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestAbs(OpTest): + def setUp(self): + self.op_type = "abs" + x = np.random.uniform(-1, 1, [11, 17]).astype("float32") + x = np.sign(x) * np.exp(np.abs(x)) + self.inputs = {'X': x} + self.outputs = {'Y': np.abs(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestReciprocal(OpTest): + def setUp(self): + self.op_type = "reciprocal" + self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} + self.outputs = {'Y': np.reciprocal(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.01) + + +class TestLog(OpTest): + def setUp(self): + self.op_type = "log" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.log(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestSquare(OpTest): + def setUp(self): + self.op_type = "square" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.square(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestBRelu(OpTest): + def setUp(self): + self.op_type = "brelu" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + x = 2 * np.sign(x) * np.exp(np.abs(x)) + self.inputs = {'X': x} + t_min = 0 + t_max = 4 + self.attrs = {'t_min': t_min, 't_max': t_max} + t = np.copy(x) + t[t < t_min] = t_min + t[t > t_max] = t_max + self.outputs = {'Y': t} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.02) + + +class TestSoftRelu(OpTest): + def setUp(self): + self.op_type = "soft_relu" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + x = 2 * np.sign(x) * np.exp(np.abs(x)) + self.inputs = {'X': x} + threshold = 4 + self.attrs = {'threshold': threshold} + t = np.copy(x) + t[t < -threshold] = -threshold + t[t > threshold] = threshold + self.outputs = {'Y': np.log((np.exp(t) + 1))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.02) + + +class TestPow(OpTest): + def setUp(self): + self.op_type = "pow" + self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} + self.attrs = {'factor': 3} + self.outputs = {'Y': np.power(self.inputs['X'], 3)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + 
self.check_grad(['X'], 'Y', max_relative_error=0.02) + + +class TestSTanh(OpTest): + def setUp(self): + self.op_type = "stanh" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + scale_a = 2.0 / 3.0 + scale_b = 1.7159 + self.attrs = {'scale_a': scale_a, 'scale_b': scale_b} + self.outputs = {'Y': scale_b * np.tanh(self.inputs['X'] * scale_a)} + + def test_check_output(self): + self.check_output() + def test_check_grad(self): self.check_grad(['X'], 'Y', max_relative_error=0.007) From 41271f03cb609a9a772c3ff720a011ff3b1a1b93 Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 14 Sep 2017 19:36:52 +0800 Subject: [PATCH 16/50] fix gpu build error --- paddle/operators/activation_op.cu | 56 ++++++------------- .../paddle/trainer_config_helpers/networks.py | 4 +- 2 files changed, 20 insertions(+), 40 deletions(-) diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 112b33d225..feed1302b2 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -19,10 +19,10 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(sigmoid, ops::ActivationKernel); + ops::SigmoidFunctor>); REGISTER_OP_GPU_KERNEL( sigmoid_grad, ops::ActivationGradKernel); + ops::SigmoidGradFunctor>); REGISTER_OP_GPU_KERNEL( exp, @@ -37,35 +37,27 @@ REGISTER_OP_GPU_KERNEL( relu_grad, ops::ActivationGradKernel>); -REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, - ops::ActivationOpGrad); -REGISTER_OP_GPU_KERNEL(tanh, - ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + tanh, + ops::ActivationKernel); REGISTER_OP_GPU_KERNEL( tanh_grad, ops::ActivationGradKernel>); -REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, - ops::ActivationOpGrad); -REGISTER_OP_GPU_KERNEL(sqrt, - ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + sqrt, + ops::ActivationKernel); REGISTER_OP_GPU_KERNEL( sqrt_grad, ops::ActivationGradKernel>); -REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, - ops::ActivationOpGrad); -REGISTER_OP_GPU_KERNEL(abs, - ops::ActivationKernel>); REGISTER_OP_GPU_KERNEL( - abs_grad, ops::ActivationGradKernel>); + abs, + ops::ActivationKernel); +REGISTER_OP_GPU_KERNEL(abs_grad, + ops::ActivationGradKernel); -REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, - reciprocal_grad, ops::ActivationOpGrad); REGISTER_OP_GPU_KERNEL(reciprocal, ops::ActivationKernel>); @@ -74,47 +66,35 @@ REGISTER_OP_GPU_KERNEL( ops::ActivationGradKernel>); -REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, - ops::ActivationOpGrad); -REGISTER_OP_GPU_KERNEL(log, - ops::ActivationKernel>); +REGISTER_OP_GPU_KERNEL( + log, + ops::ActivationKernel); REGISTER_OP_GPU_KERNEL( log_grad, ops::ActivationGradKernel>); -REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, - ops::ActivationOpGrad); REGISTER_OP_GPU_KERNEL(square, ops::ActivationKernel>); + ops::SquareFunctor>); REGISTER_OP_GPU_KERNEL( square_grad, ops::ActivationGradKernel>); -REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, - ops::ActivationOpGrad); REGISTER_OP_GPU_KERNEL(brelu, ops::BReluKernel); REGISTER_OP_GPU_KERNEL(brelu_grad, ops::BReluGradKernel); -REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, - soft_relu_grad, ops::ActivationOpGrad); REGISTER_OP_GPU_KERNEL(soft_relu, ops::SoftReluKernel); REGISTER_OP_GPU_KERNEL( soft_relu_grad, ops::SoftReluGradKernel); -REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, - ops::ActivationOpGrad); REGISTER_OP_GPU_KERNEL(pow, 
ops::PowKernel); REGISTER_OP_GPU_KERNEL(pow_grad, ops::PowGradKernel); -REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, - ops::ActivationOpGrad); REGISTER_OP_GPU_KERNEL(stanh, ops::STanhKernel); REGISTER_OP_GPU_KERNEL(stanh_grad, - ops::STanhGradKernel); \ No newline at end of file + ops::STanhGradKernel); diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 34be203ee2..28a71cf788 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1406,7 +1406,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1456,7 +1456,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. if len(layers) != 1: From 34ecfcad4a182f8d5c5feae03f290242adcbc313 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 15 Sep 2017 03:20:36 +0000 Subject: [PATCH 17/50] fix code style --- python/paddle/trainer_config_helpers/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 28a71cf788..34be203ee2 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1406,7 +1406,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1456,7 +1456,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. 
if len(layers) != 1: From 48f5f6bdd071736df63d7bdcf6a3740c8ae06240 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 15 Sep 2017 11:23:19 +0800 Subject: [PATCH 18/50] refine some operators' python unittests --- .../v2/framework/tests/test_activation_op.py | 124 ++++++++++-------- 1 file changed, 67 insertions(+), 57 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py index 7cd39dfe91..003f6d50b6 100644 --- a/python/paddle/v2/framework/tests/test_activation_op.py +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -18,21 +18,6 @@ class TestExp(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.007) -class TestRelu(OpTest): - def setUp(self): - self.op_type = "relu" - x = np.random.uniform(-1, 1, [11, 17]).astype("float32") - x = np.sign(x) * np.exp(np.abs(x)) - self.inputs = {'X': x} - self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) - - class TestSigmoid(OpTest): def setUp(self): self.op_type = "sigmoid" @@ -81,8 +66,12 @@ class TestSqrt(OpTest): class TestAbs(OpTest): def setUp(self): self.op_type = "abs" - x = np.random.uniform(-1, 1, [11, 17]).astype("float32") - x = np.sign(x) * np.exp(np.abs(x)) + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + # Because we set delta = 0.005 in caculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is unaccurate. + # we should avoid this + x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} self.outputs = {'Y': np.abs(self.inputs['X'])} @@ -93,41 +82,14 @@ class TestAbs(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.007) -class TestReciprocal(OpTest): - def setUp(self): - self.op_type = "reciprocal" - self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} - self.outputs = {'Y': np.reciprocal(self.inputs['X'])} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.01) - - -class TestLog(OpTest): - def setUp(self): - self.op_type = "log" - self.inputs = { - 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") - } - self.outputs = {'Y': np.log(self.inputs['X'])} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Y', max_relative_error=0.007) - - -class TestSquare(OpTest): +class TestRelu(OpTest): def setUp(self): - self.op_type = "square" - self.inputs = { - 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") - } - self.outputs = {'Y': np.square(self.inputs['X'])} + self.op_type = "relu" + x = np.random.uniform(-1, 1, [11, 17]).astype("float32") + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + self.inputs = {'X': x} + self.outputs = {'Y': np.maximum(self.inputs['X'], 0)} def test_check_output(self): self.check_output() @@ -140,10 +102,13 @@ class TestBRelu(OpTest): def setUp(self): self.op_type = "brelu" x = np.random.uniform(-1, 1, [4, 4]).astype("float32") - x = 2 * np.sign(x) * np.exp(np.abs(x)) - self.inputs = {'X': x} - t_min = 0 + t_min = 1 t_max = 4 + # The same with TestAbs + x[np.abs(x - t_min) < 0.005] = t_min + 0.02 + x[np.abs(x - t_max) < 0.005] = t_min + 0.02 + + self.inputs = {'X': x} self.attrs = {'t_min': t_min, 't_max': t_max} t = np.copy(x) t[t < t_min] = t_min @@ -160,10 +125,12 @@ class 
TestBRelu(OpTest): class TestSoftRelu(OpTest): def setUp(self): self.op_type = "soft_relu" - x = np.random.uniform(-1, 1, [4, 4]).astype("float32") - x = 2 * np.sign(x) * np.exp(np.abs(x)) + x = np.random.uniform(-3, 3, [4, 4]).astype("float32") + threshold = 2 + # The same reason with TestAbs + x[np.abs(x - threshold) < 0.005] = threshold + 0.02 + x[np.abs(x + threshold) < 0.005] = -threshold + 0.02 self.inputs = {'X': x} - threshold = 4 self.attrs = {'threshold': threshold} t = np.copy(x) t[t < -threshold] = -threshold @@ -177,6 +144,49 @@ class TestSoftRelu(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.02) +class TestReciprocal(OpTest): + def setUp(self): + self.op_type = "reciprocal" + self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} + self.outputs = {'Y': np.reciprocal(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.01) + + +class TestLog(OpTest): + def setUp(self): + self.op_type = "log" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.log(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestSquare(OpTest): + def setUp(self): + self.op_type = "square" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.square(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + class TestPow(OpTest): def setUp(self): self.op_type = "pow" From 57011b202275b6be135e2d708c67fd48ea23b675 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Sat, 16 Sep 2017 00:52:25 +0800 Subject: [PATCH 19/50] reste --- paddle/pybind/pybind.cc | 23 --------------------- python/paddle/v2/framework/tests/op_test.py | 1 + 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index c7009a604f..a7a38339fb 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/framework/backward.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/cond_op.h" #include "paddle/operators/net_op.h" #include "paddle/operators/recurrent_op.h" #include "paddle/platform/enforce.h" @@ -289,28 +288,6 @@ All parameter, weight, gradient are variables in Paddle. 
[](operators::RecurrentOp &self, const operators::NetOp &net) -> void { self.set_stepnet(net.Clone()); }); - // cond_op - py::class_(m, "CondOp") - .def_static("create", - [](py::bytes protobin) -> operators::CondOp * { - OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - auto cond_op = OpRegistry::CreateOp(desc); - return static_cast(cond_op.release()); - }) - .def("set_truenet", - [](operators::CondOp &self, const operators::NetOp &net) -> void { - self.set_truenet(net.Clone()); - }) - .def("set_falsenet", - [](operators::CondOp &self, const operators::NetOp &net) -> void { - self.set_falsenet(net.Clone()); - }); - m.def("unique_integer", UniqueIntegerGenerator); m.def("is_compile_gpu", IsCompileGPU); diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 31724d98ed..8e111af467 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -64,6 +64,7 @@ def set_input(scope, op, inputs, place): tensor.set_dims(in_array.shape) tensor.set(in_array, place) if isinstance(in_val, tuple): + print "set lod" tensor.set_lod(in_val[1]) From 57a3b8b69e750e47487a24d5c6888fc122a63fa5 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 18 Sep 2017 15:18:24 +0800 Subject: [PATCH 20/50] 1. Implement GPUCrop kernel instead of eigen. 2. Fix unitest --- paddle/operators/crop_op.cc | 26 +++---- paddle/operators/crop_op.cu | 8 +-- paddle/operators/crop_op.h | 7 +- python/paddle/v2/framework/tests/op_test.py | 10 +-- .../paddle/v2/framework/tests/test_crop_op.py | 69 +++++++++---------- 5 files changed, 53 insertions(+), 67 deletions(-) diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 9f4a3152e4..09fa13dfbb 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -19,6 +19,7 @@ namespace paddle { namespace operators { using framework::Tensor; +using framework::LoDTensor; class CropOp : public framework::OperatorWithKernel { public: @@ -26,8 +27,8 @@ class CropOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - auto x_dim = ctx.Input("X")->dims(); - auto Y = ctx.Input("Y"); + auto x_dim = ctx.Input("X")->dims(); + auto Y = ctx.Input("Y"); if (Y == nullptr) { auto shape = Attr>("shape"); PADDLE_ENFORCE_EQ( @@ -37,9 +38,9 @@ class CropOp : public framework::OperatorWithKernel { for (size_t i = 0; i < shape.size(); ++i) { tensor_shape[i] = (int64_t)shape[i]; } - ctx.Output("Out")->Resize(framework::make_ddim(tensor_shape)); + ctx.Output("Out")->Resize(framework::make_ddim(tensor_shape)); } else { - ctx.Output("Out")->Resize(Y->dims()); + ctx.Output("Out")->Resize(Y->dims()); } } }; @@ -112,8 +113,8 @@ class CropOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); - auto x_dims = ctx.Input("X")->dims(); - auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto x_dims = ctx.Input("X")->dims(); + auto *x_grad = ctx.Output(framework::GradVarName("X")); if (x_grad != nullptr) { x_grad->Resize(x_dims); } @@ -141,23 +142,17 @@ template class CropCPUKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext &context) const override { - LOG(INFO) << "CropCPUKernel step1"; - auto *x = context.Input("X"); - LOG(INFO) << "CropCPUKernel step2"; - auto *out = context.Output("Out"); - LOG(INFO) << "CropCPUKernel step3"; + auto *x = context.Input("X"); + auto *out = context.Output("Out"); auto x_data = x->data(); - T *out_data = out->mutable_data(paddle::platform::CPUPlace()); - LOG(INFO) << "CropCPUKernel step4"; + T *out_data = out->mutable_data(context.GetPlace()); auto x_dims = x->dims(); auto out_dims = out->dims(); - LOG(INFO) << "CropCPUKernel step5"; int64_t out_count = framework::product(out_dims); std::vector x_shape = framework::vectorize(x_dims); std::vector out_shape = framework::vectorize(out_dims); auto offsets = context.op().Attr>("offsets"); - LOG(INFO) << "CropCPUKernel step6"; PADDLE_ENFORCE_EQ( x_dims.size(), offsets.size(), "Offsets size should be equal to dimension size of input tensor."); @@ -171,7 +166,6 @@ class CropCPUKernel : public framework::OpKernel { for (int64_t i = 0; i < out_count; ++i) { out_data[i] = x_data[transIndex(out_shape, x_shape, crop_rules, i)]; } - LOG(INFO) << "CropCPUKernel step7"; } }; diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index f39478858a..1715b2eaf9 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -20,6 +20,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template __global__ void CropKernel(const int N, const int64_t* out_shape, @@ -48,9 +49,8 @@ template void CropCUDAFunctoin(const framework::ExecutionContext& context) { PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), "It must use GPUPlace."); - LOG(INFO) << "CropCUDAFunctoin step1"; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); auto x_data = x->data(); T* out_data = out->mutable_data(paddle::platform::GPUPlace()); auto x_dims = x->dims(); @@ -100,7 +100,7 @@ template class CropOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - size_t rank = context.Input("X")->dims().size(); + size_t rank = context.Input("X")->dims().size(); switch (rank) { case 1: CropCUDAFunctoin(context); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 40bd024674..7f041737a7 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -25,11 +25,12 @@ template ; using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template void CropGradFunction(const framework::ExecutionContext& context) { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); if (d_x != nullptr) { d_x->mutable_data(context.GetPlace()); auto d_x_dims = d_x->dims(); @@ -52,7 +53,7 @@ class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { size_t rank = - context.Input(framework::GradVarName("Out"))->dims().size(); + context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: CropGradFunction(context); diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 8e111af467..a0533efacd 100644 --- 
a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -64,7 +64,6 @@ def set_input(scope, op, inputs, place): tensor.set_dims(in_array.shape) tensor.set(in_array, place) if isinstance(in_val, tuple): - print "set lod" tensor.set_lod(in_val[1]) @@ -189,10 +188,8 @@ class OpTest(unittest.TestCase): self.op.infer_shape(self.scope) ctx = core.DeviceContext.create(place) self.op.run(self.scope, ctx) - print "finish self.op.run" + for out_name, out_dup in Operator.get_op_outputs(self.op.type()): - print "finish Operator.get_op_outputs" - print "out_dup=%s; out_name=%s" % (out_dup, out_name) if out_dup: sub_out = self.outputs[out_name] for sub_out_name in sub_out: @@ -204,17 +201,12 @@ class OpTest(unittest.TestCase): actual, expect, atol=1e-05), "output name: " + out_name + "has diff") else: - v = self.scope.find_var(out_name) - print "var=%s" % v - print "tensor=%s" % v.get_tensor() actual = np.array(self.scope.find_var(out_name).get_tensor()) - print "actual=%s" % actual expect = self.outputs[out_name] self.assertTrue( np.allclose( actual, expect, atol=1e-05), "output name: " + out_name + "has diff") - print "finish check in %s" % place def check_output(self): places = [core.CPUPlace()] diff --git a/python/paddle/v2/framework/tests/test_crop_op.py b/python/paddle/v2/framework/tests/test_crop_op.py index 45f13d84e5..62c883bdc1 100644 --- a/python/paddle/v2/framework/tests/test_crop_op.py +++ b/python/paddle/v2/framework/tests/test_crop_op.py @@ -47,45 +47,44 @@ class TestCropOp(OpTest): def initTestCase(self): self.x_shape = (8, 8) - self.crop_shape = [2, 2] + self.crop_shape = (2, 2) self.offsets = [1, 2] def test_check_output(self): self.check_output() - print "finish check_output" - - #def test_check_grad_normal(self): - # self.check_grad(['X'], 'Out', max_relative_error=0.006) - - #class TestCase1(TestCropOp): - # def initTestCase(self): - # self.x_shape = (16, 16, 16) - # self.crop_shape = [2, 2, 3] - # self.offsets = [1, 5, 3] - # - # - #class TestCase2(TestCropOp): - # def initTestCase(self): - # self.x_shape = (4, 4) - # self.crop_shape = [4, 4] - # self.offsets = [0, 0] - # - # - #class TestCase3(TestCropOp): - # def initTestCase(self): - # self.x_shape = (16, 16, 16) - # self.crop_shape = [2, 2, 3] - # self.offsets = [1, 5, 3] - # self.crop_by_input = True - # - # - #class TestCase4(TestCropOp): - # def initTestCase(self): - # self.x_shape = (4, 4) - # self.crop_shape = [4, 4] - # self.offsets = [0, 0] - # self.crop_by_input = True - # + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out', max_relative_error=0.006) + + +class TestCase1(TestCropOp): + def initTestCase(self): + self.x_shape = (16, 8, 32) + self.crop_shape = [2, 2, 3] + self.offsets = [1, 5, 3] + + +class TestCase2(TestCropOp): + def initTestCase(self): + self.x_shape = (4, 8) + self.crop_shape = [4, 8] + self.offsets = [0, 0] + + +class TestCase3(TestCropOp): + def initTestCase(self): + self.x_shape = (4, 8, 16) + self.crop_shape = [2, 2, 3] + self.offsets = [1, 5, 3] + self.crop_by_input = True + + +class TestCase4(TestCropOp): + def initTestCase(self): + self.x_shape = (4, 4) + self.crop_shape = [4, 4] + self.offsets = [0, 0] + self.crop_by_input = True if __name__ == '__main__': From 0c05ea39d4632de296d4f607dd15dce19df5cd04 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 18 Sep 2017 16:19:50 +0800 Subject: [PATCH 21/50] Pull latest pybind.cc to crop_op --- paddle/operators/crop_op.cc | 7 +++++++ paddle/operators/crop_op.cu | 3 +-- 
paddle/operators/crop_op.h | 3 +-- paddle/pybind/pybind.cc | 23 +++++++++++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 09fa13dfbb..33fa9b7928 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -29,6 +29,10 @@ class CropOp : public framework::OperatorWithKernel { void InferShape(const framework::InferShapeContext &ctx) const override { auto x_dim = ctx.Input("X")->dims(); auto Y = ctx.Input("Y"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), + "Input(X) of CropOp should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"), + "Output(Out) of CropOp should not be null."); if (Y == nullptr) { auto shape = Attr>("shape"); PADDLE_ENFORCE_EQ( @@ -40,6 +44,9 @@ class CropOp : public framework::OperatorWithKernel { } ctx.Output("Out")->Resize(framework::make_ddim(tensor_shape)); } else { + PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(Y->dims()), + "Tensor rank of both CropOp's " + "inputs must be same."); ctx.Output("Out")->Resize(Y->dims()); } } diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index 1715b2eaf9..561dbe4803 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -19,8 +19,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; +using framework::LoDTensor; template __global__ void CropKernel(const int N, const int64_t* out_shape, diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 7f041737a7..09d42f4b7e 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -24,8 +24,7 @@ template using EigenTensor = framework::EigenTensor; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; +using framework::LoDTensor; template void CropGradFunction(const framework::ExecutionContext& context) { diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index a7a38339fb..c7009a604f 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/framework/backward.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/cond_op.h" #include "paddle/operators/net_op.h" #include "paddle/operators/recurrent_op.h" #include "paddle/platform/enforce.h" @@ -288,6 +289,28 @@ All parameter, weight, gradient are variables in Paddle. 
[](operators::RecurrentOp &self, const operators::NetOp &net) -> void { self.set_stepnet(net.Clone()); }); + // cond_op + py::class_(m, "CondOp") + .def_static("create", + [](py::bytes protobin) -> operators::CondOp * { + OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + auto cond_op = OpRegistry::CreateOp(desc); + return static_cast(cond_op.release()); + }) + .def("set_truenet", + [](operators::CondOp &self, const operators::NetOp &net) -> void { + self.set_truenet(net.Clone()); + }) + .def("set_falsenet", + [](operators::CondOp &self, const operators::NetOp &net) -> void { + self.set_falsenet(net.Clone()); + }); + m.def("unique_integer", UniqueIntegerGenerator); m.def("is_compile_gpu", IsCompileGPU); From 5e0e455dc8288ce08771865e930c2cadb957a05a Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 18 Sep 2017 16:47:52 +0800 Subject: [PATCH 22/50] Add CUDA stream when launching kernel. --- paddle/operators/crop_op.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index 561dbe4803..a40eb7af38 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -88,7 +88,13 @@ void CropCUDAFunctoin(const framework::ExecutionContext& context) { int d = out_dims[1]; int block = 512; int grid = (n * d + block - 1) / block; - CropKernel<<>>(out_count, out_shape_gpu, x_shape_gpu, + + auto* device_context = + const_cast(context.device_context_); + CropKernel<<(device_context) + ->stream()>>>(out_count, out_shape_gpu, x_shape_gpu, crop_rules_gpu, x_data, out_data); cudaFree(crop_rules_gpu); cudaFree(x_shape_gpu); From 8d9d537b9fbab3d957c57d1adf52d453e7c00af4 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 18 Sep 2017 17:08:08 +0800 Subject: [PATCH 23/50] remove op_test_util.py --- .../paddle/v2/framework/tests/op_test_util.py | 74 ------------------- 1 file changed, 74 deletions(-) delete mode 100644 python/paddle/v2/framework/tests/op_test_util.py diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py deleted file mode 100644 index 88adede7c7..0000000000 --- a/python/paddle/v2/framework/tests/op_test_util.py +++ /dev/null @@ -1,74 +0,0 @@ -import numpy -import paddle.v2.framework.core as core -from paddle.v2.framework.op import Operator - - -class OpTestMeta(type): - """ - Operator Test ClassMeta. - - It injects `test_all` method into user's OperatorTest class, to make Python - unittest module run that method. - - The `test_all` read what value is stored in `self`. It use self's values to - create and run a operator, and check whether that op is OK or not. - - See `test_add_two_op` for example usage. 
- """ - - def __new__(cls, name, bases, attrs): - obj = super(OpTestMeta, cls).__new__(cls, name, bases, attrs) - - def test_all(self): - scope = core.Scope() - kwargs = dict() - places = [core.CPUPlace()] - if core.is_compile_gpu(): - places.append(core.GPUPlace(0)) - - for place in places: - for in_name in Operator.get_op_input_names(self.type): - if hasattr(self, "inputs") and in_name in self.inputs: - kwargs[in_name] = in_name - var = scope.new_var(in_name).get_tensor() - arr = self.inputs[in_name] - var.set_dims(arr.shape) - var.set(arr, place) - else: - kwargs[in_name] = "@EMPTY@" - - for out_name in Operator.get_op_output_names(self.type): - if not hasattr(self, "outputs"): - raise ValueError( - "The test op must set self.outputs dict.") - if out_name not in self.outputs: - raise ValueError("The %s is not in self.outputs dict." % - (out_name)) - kwargs[out_name] = out_name - scope.new_var(out_name).get_tensor() - - for attr_name in Operator.get_op_attr_names(self.type): - if hasattr(self, "attrs") and attr_name in self.attrs: - kwargs[attr_name] = self.attrs[attr_name] - - op = Operator(self.type, **kwargs) - if isinstance(place, core.GPUPlace) and not op.support_gpu(): - return - - op.infer_shape(scope) - - ctx = core.DeviceContext.create(place) - op.run(scope, ctx) - - for out_name in Operator.get_op_output_names(self.type): - actual = numpy.array(scope.find_var(out_name).get_tensor()) - expect = self.outputs[out_name] - print "actual: %s" % actual - print "expect: %s" % expect - self.assertTrue( - numpy.allclose( - actual, expect, atol=1e-05), - "output name: " + out_name + " has diff") - - obj.test_all = test_all - return obj From 2c29cf1ea5ebf1ee73090e1002690d480af252d1 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 19 Sep 2017 01:06:16 +0800 Subject: [PATCH 24/50] Use Tensor as the temp variables instead of CUDA api --- paddle/operators/crop_op.cc | 46 +++++++++++++++++----------------- paddle/operators/crop_op.cu | 50 ++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 33fa9b7928..ee4bc9cdaf 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -27,12 +27,12 @@ class CropOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - auto x_dim = ctx.Input("X")->dims(); - auto Y = ctx.Input("Y"); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) of CropOp should not be null."); PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"), "Output(Out) of CropOp should not be null."); + auto x_dim = ctx.Input("X")->dims(); + auto Y = ctx.Input("Y"); if (Y == nullptr) { auto shape = Attr>("shape"); PADDLE_ENFORCE_EQ( @@ -40,7 +40,7 @@ class CropOp : public framework::OperatorWithKernel { "Shape size should be equal to dimention size of input tensor."); std::vector tensor_shape(shape.size()); for (size_t i = 0; i < shape.size(); ++i) { - tensor_shape[i] = (int64_t)shape[i]; + tensor_shape[i] = static_cast(shape[i]); } ctx.Output("Out")->Resize(framework::make_ddim(tensor_shape)); } else { @@ -65,6 +65,15 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output of crop op " "with the same dimension as X."); + AddAttr>("offsets", + "A list describing offsets to be cropped." + "The size of offsets list should be as same as " + "dimension size of input X."); + AddAttr>("shape", + "A list describing the shape of output." 
+ "The size of shape list should be as same as " + "dimension size of input X.") + .SetDefault(std::vector()); AddComment(R"DOC( Crop Operator. Crop input into output, as specified by offsets and shape. @@ -81,33 +90,24 @@ The input should be a k-D tensor(k > 0 and k < 7). As an example: Given: -X = [[0, 1, 2, 0, 0] - [0, 3, 4, 0, 0] - [0, 0, 0, 0, 0]] + X = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]] and -offsets = [0, 1] + offsets = [0, 1] and -shape = [2, 2] + shape = [2, 2] then we get -Out = [[1, 2], - [3, 4]] + Out = [[1, 2], + [3, 4]] )DOC"); - AddAttr>("offsets", - "A list describing offsets to be cropped." - "The size of offsets list should be as same as " - "dimension size of input X."); - AddAttr>("shape", - "A list describing the shape of output." - "The size of shape list should be as same as " - "dimension size of input X.") - .SetDefault(std::vector()); } }; @@ -149,17 +149,17 @@ template class CropCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *out = context.Output("Out"); + auto *x = context.Input("X"); + auto *out = context.Output("Out"); auto x_data = x->data(); T *out_data = out->mutable_data(context.GetPlace()); auto x_dims = x->dims(); auto out_dims = out->dims(); - int64_t out_count = framework::product(out_dims); + int64_t out_count = out->numel(); std::vector x_shape = framework::vectorize(x_dims); std::vector out_shape = framework::vectorize(out_dims); - auto offsets = context.op().Attr>("offsets"); + auto offsets = context.Attr>("offsets"); PADDLE_ENFORCE_EQ( x_dims.size(), offsets.size(), "Offsets size should be equal to dimension size of input tensor."); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index a40eb7af38..f499ce3f27 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -20,6 +20,7 @@ namespace paddle { namespace operators { using framework::LoDTensor; +using framework::Tensor; template __global__ void CropKernel(const int N, const int64_t* out_shape, @@ -54,35 +55,36 @@ void CropCUDAFunctoin(const framework::ExecutionContext& context) { T* out_data = out->mutable_data(paddle::platform::GPUPlace()); auto x_dims = x->dims(); auto out_dims = out->dims(); - int64_t out_count = framework::product(out_dims); - int64_t x_shape[D]; - int64_t out_shape[D]; + int64_t out_count = out->numel(); + Tensor x_shape; + Tensor out_shape; + int64_t* x_shape_data = + x_shape.mutable_data({D}, paddle::platform::CPUPlace()); + int64_t* out_shape_data = + out_shape.mutable_data({D}, paddle::platform::CPUPlace()); for (int i = 0; i < D; ++i) { - x_shape[i] = x_dims[i]; - out_shape[i] = out_dims[i]; + x_shape_data[i] = x_dims[i]; + out_shape_data[i] = out_dims[i]; } - int64_t* x_shape_gpu; - int64_t* out_shape_gpu; - cudaMalloc((void**)&x_shape_gpu, sizeof(int64_t) * D); - cudaMemcpy(x_shape_gpu, x_shape, sizeof(int64_t) * D, cudaMemcpyHostToDevice); - cudaMalloc((void**)&out_shape_gpu, sizeof(int64_t) * D); - cudaMemcpy(out_shape_gpu, out_shape, sizeof(int64_t) * D, - cudaMemcpyHostToDevice); + Tensor x_shape_gpu; + Tensor out_shape_gpu; + x_shape_gpu.CopyFrom(x_shape, paddle::platform::GPUPlace()); + out_shape_gpu.CopyFrom(out_shape, paddle::platform::GPUPlace()); auto offsets = context.op().Attr>("offsets"); PADDLE_ENFORCE_EQ( D, offsets.size(), "Offsets size should be equal to dimension size of input tensor."); - int crop_rules[D * 2]; - for (size_t i = 0; i < x_dims.size(); ++i) { - crop_rules[i * 
2] = offsets[i]; - crop_rules[i * 2 + 1] = x_dims[i] - out_dims[i] - offsets[i]; + Tensor crop_rules; + int* crop_rules_data = + crop_rules.mutable_data({D * 2}, paddle::platform::CPUPlace()); + for (size_t i = 0; i < D; ++i) { + crop_rules_data[i * 2] = offsets[i]; + crop_rules_data[i * 2 + 1] = x_dims[i] - out_dims[i] - offsets[i]; } - int* crop_rules_gpu; - cudaMalloc((void**)&crop_rules_gpu, sizeof(int) * D * 2); - cudaMemcpy(crop_rules_gpu, crop_rules, sizeof(int) * D * 2, - cudaMemcpyHostToDevice); + Tensor crop_rules_gpu; + crop_rules_gpu.CopyFrom(crop_rules, paddle::platform::GPUPlace()); int n = out_dims[0]; int d = out_dims[1]; @@ -94,11 +96,9 @@ void CropCUDAFunctoin(const framework::ExecutionContext& context) { CropKernel<<(device_context) - ->stream()>>>(out_count, out_shape_gpu, x_shape_gpu, - crop_rules_gpu, x_data, out_data); - cudaFree(crop_rules_gpu); - cudaFree(x_shape_gpu); - cudaFree(out_shape_gpu); + ->stream()>>>( + out_count, out_shape_gpu.data(), x_shape_gpu.data(), + crop_rules_gpu.data(), x_data, out_data); } template From fad48fa6b1865d353cc277845db5195f79df3be7 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 19 Sep 2017 15:31:58 +0800 Subject: [PATCH 25/50] Add bool type for attr. --- paddle/framework/attribute.cc | 18 ++++++++++++++++++ paddle/framework/attribute.h | 5 +++-- paddle/framework/framework.proto | 4 ++++ python/paddle/v2/framework/op.py | 4 ++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index 27132eaa0b..e18d1add96 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -19,6 +19,10 @@ limitations under the License. */ namespace paddle { namespace framework { +template <> +AttrType AttrTypeID() { + return BOOL; +} template <> AttrType AttrTypeID() { return INT; @@ -32,6 +36,10 @@ AttrType AttrTypeID() { return STRING; } template <> +AttrType AttrTypeID>() { + return BOOLS; +} +template <> AttrType AttrTypeID>() { return INTS; } @@ -50,6 +58,9 @@ AttrType AttrTypeID>>() { Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { switch (attr_desc.type()) { + case paddle::framework::AttrType::BOOL: { + return attr_desc.b(); + } case paddle::framework::AttrType::INT: { return attr_desc.i(); } @@ -59,6 +70,13 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { case paddle::framework::AttrType::STRING: { return attr_desc.s(); } + case paddle::framework::AttrType::BOOLS: { + std::vector val(attr_desc.bools_size()); + for (int i = 0; i < attr_desc.bools_size(); ++i) { + val[i] = attr_desc.bools(i); + } + return val; + } case paddle::framework::AttrType::INTS: { std::vector val(attr_desc.ints_size()); for (int i = 0; i < attr_desc.ints_size(); ++i) { diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index 2b788a76ca..3232a9003e 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -27,8 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -typedef boost::variant, - std::vector, std::vector, +typedef boost::variant, std::vector, std::vector, + std::vector, std::vector>> Attribute; diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index dfcb5fb621..ec7b750d81 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -23,6 +23,8 @@ enum AttrType { FLOATS = 4; STRINGS = 5; INT_PAIRS = 6; + BOOL = 7; + BOOLS = 8; } message IntPair { @@ -44,6 +46,8 @@ message OpDesc { repeated float floats = 7; repeated string strings = 8; repeated IntPair int_pairs = 9; + optional bool b = 10; + repeated bool bools = 6; }; message Var { diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py index 6cca41e43b..93dfbc5d30 100644 --- a/python/paddle/v2/framework/op.py +++ b/python/paddle/v2/framework/op.py @@ -89,12 +89,16 @@ class OpDescCreationMethod(object): new_attr.f = user_defined_attr elif attr.type == framework_pb2.STRING: new_attr.s = user_defined_attr + elif attr.type == framework_pb2.BOOL: + new_attr.b = user_defined_attr elif attr.type == framework_pb2.INTS: new_attr.ints.extend(user_defined_attr) elif attr.type == framework_pb2.FLOATS: new_attr.floats.extend(user_defined_attr) elif attr.type == framework_pb2.STRINGS: new_attr.strings.extend(user_defined_attr) + elif attr.type == framework_pb2.BOOLS: + new_attr.bools.extend(user_defined_attr) elif attr.type == framework_pb2.INT_PAIRS: for p in user_defined_attr: pair = new_attr.int_pairs.add() From 94fa9d1a957d3faecb5a15cb3c8d0c0f5c7eabf7 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 19 Sep 2017 22:33:01 +0800 Subject: [PATCH 26/50] Remove const cast for device context --- paddle/operators/crop_op.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index f499ce3f27..05782145b8 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -91,12 +91,11 @@ void CropCUDAFunctoin(const framework::ExecutionContext& context) { int block = 512; int grid = (n * d + block - 1) / block; - auto* device_context = - const_cast(context.device_context_); - CropKernel<<(device_context) - ->stream()>>>( + CropKernel< + T, + D><<( + context.device_context()) + .stream()>>>( out_count, out_shape_gpu.data(), x_shape_gpu.data(), crop_rules_gpu.data(), x_data, out_data); } From 72ba02701b78f3296dfecb154b6c78fa475a19c7 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 20 Sep 2017 12:59:17 +0800 Subject: [PATCH 27/50] Add bool type for attribute and use it in dropout_op. 
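To make the intent of this change concrete, here is a rough usage sketch (the names MyOpMaker/MyKernel are hypothetical and the usual operator headers are assumed; this is an illustration, not code from the patch) of how an operator can now declare and read a real boolean attribute instead of the old int 0/1 workaround:

    #include "paddle/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    // Hypothetical maker: declare a bool attribute with a default value.
    class MyOpMaker : public framework::OpProtoAndCheckerMaker {
     public:
      MyOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
          : OpProtoAndCheckerMaker(proto, op_checker) {
        AddInput("X", "The input of my op.");
        AddOutput("Out", "The output of my op.");
        AddAttr<bool>("is_training", "Whether in training phase.").SetDefault(true);
      }
    };

    // Hypothetical kernel: the attribute is read back as a bool, so checks of
    // the form Attr<int>("is_training") == 1 are no longer needed.
    template <typename Place, typename T>
    class MyKernel : public framework::OpKernel {
     public:
      void Compute(const framework::ExecutionContext &ctx) const override {
        if (ctx.Attr<bool>("is_training")) {
          // training-phase branch
        }
      }
    };

    }  // namespace operators
    }  // namespace paddle

On the Python side the attribute is passed as a plain bool, e.g. self.attrs = {'is_training': True}, which OpDescCreationMethod serializes into the new `b` field of OpDesc.Attr.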
--- paddle/framework/attribute.cc | 8 ++++---- paddle/framework/framework.proto | 6 +++--- paddle/operators/dropout_op.cc | 17 +++++------------ paddle/operators/dropout_op.cu | 2 +- paddle/operators/dropout_op.h | 6 +++--- python/paddle/v2/framework/op.py | 4 ++-- .../v2/framework/tests/test_dropout_op.py | 10 +++++----- 7 files changed, 23 insertions(+), 30 deletions(-) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index e18d1add96..e8f40a868d 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -21,7 +21,7 @@ namespace framework { template <> AttrType AttrTypeID() { - return BOOL; + return BOOLEAN; } template <> AttrType AttrTypeID() { @@ -37,7 +37,7 @@ AttrType AttrTypeID() { } template <> AttrType AttrTypeID>() { - return BOOLS; + return BOOLEANS; } template <> AttrType AttrTypeID>() { @@ -58,7 +58,7 @@ AttrType AttrTypeID>>() { Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { switch (attr_desc.type()) { - case paddle::framework::AttrType::BOOL: { + case paddle::framework::AttrType::BOOLEAN: { return attr_desc.b(); } case paddle::framework::AttrType::INT: { @@ -70,7 +70,7 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { case paddle::framework::AttrType::STRING: { return attr_desc.s(); } - case paddle::framework::AttrType::BOOLS: { + case paddle::framework::AttrType::BOOLEANS: { std::vector val(attr_desc.bools_size()); for (int i = 0; i < attr_desc.bools_size(); ++i) { val[i] = attr_desc.bools(i); diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index ec7b750d81..f232e48c45 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -23,8 +23,8 @@ enum AttrType { FLOATS = 4; STRINGS = 5; INT_PAIRS = 6; - BOOL = 7; - BOOLS = 8; + BOOLEAN = 7; + BOOLEANS = 8; } message IntPair { @@ -47,7 +47,7 @@ message OpDesc { repeated string strings = 8; repeated IntPair int_pairs = 9; optional bool b = 10; - repeated bool bools = 6; + repeated bool bools = 11; }; message Var { diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index b111b9fccb..95641f3ce7 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -29,13 +29,10 @@ class DropoutOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); PADDLE_ENFORCE_GE(ctx.Attr("dropout_prob"), 0); PADDLE_ENFORCE_LE(ctx.Attr("dropout_prob"), 1); - // TODO(xinghai-sun): remove this check after swtiching to bool - PADDLE_ENFORCE(ctx.Attr("is_training") == 0 || - ctx.Attr("is_training") == 1); auto dims = ctx.Input("X")->dims(); ctx.Output("Out")->Resize(dims); - if (ctx.Attr("is_training") == 1) { + if (ctx.Attr("is_training")) { ctx.Output("Mask")->Resize(dims); } } @@ -49,8 +46,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("dropout_prob", "Probability of setting units to zero.") .SetDefault(.5f); - // TODO(xinghai-sun): use bool for is_training after bool is supported. - AddAttr("is_training", "Whether in training phase.").SetDefault(1); + AddAttr("is_training", "Whether in training phase.").SetDefault(true); AddAttr("seed", "Dropout random seed.").SetDefault(0); AddInput("X", "The input of dropout op."); AddOutput("Out", "The output of dropout op."); @@ -59,7 +55,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Dropout Operator. 
-"Dropout" refers to randomly dropping out units in a nerual network. It is a +'Dropout' refers to randomly dropping out units in a nerual network. It is a regularization technique for reducing overfitting by preventing neuron co-adaption during training. The dropout operator randomly set (according to the given dropout probability) the outputs of some units to zero, while others @@ -75,8 +71,8 @@ class DropoutOpGrad : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_EQ(ctx.Attr("is_training"), 1, - "GradOp is only callable when is_training is true"); + PADDLE_ENFORCE(ctx.Attr("is_training"), + "GradOp is only callable when is_training is true"); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null."); @@ -85,9 +81,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(ctx.Attr("dropout_prob"), 0); PADDLE_ENFORCE_LE(ctx.Attr("dropout_prob"), 1); - // TODO(xinghai-sun): remove this check after swtiching to bool - PADDLE_ENFORCE(ctx.Attr("is_training") == 0 || - ctx.Attr("is_training") == 1); auto x_dims = ctx.Input("X")->dims(); auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); PADDLE_ENFORCE_EQ(x_dims, out_dims, diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index 186237fb23..a04e4a22cc 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -59,7 +59,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto Y = EigenMatrix::Reshape(*y, 1); auto place = context.GetEigenDevice(); - if (context.Attr("is_training") == 1) { + if (context.Attr("is_training")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); int size = framework::product(mask->dims()); diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index 82eafee0e0..d57f64afcb 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -35,7 +35,7 @@ class CPUDropoutKernel : public framework::OpKernel { auto* y_data = y->mutable_data(context.GetPlace()); AttrType dropout_prob = context.Attr("dropout_prob"); - if (context.Attr("is_training") == 1) { + if (context.Attr("is_training")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); int seed = context.Attr("seed"); @@ -65,8 +65,8 @@ template class DropoutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ(context.Attr("is_training"), 1, - "GradOp is only callable when is_training is true"); + PADDLE_ENFORCE(context.Attr("is_training"), + "GradOp is only callable when is_training is true"); auto* grad_x = context.Output(framework::GradVarName("X")); auto* grad_y = context.Input(framework::GradVarName("Out")); diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py index 93dfbc5d30..9086a5cc34 100644 --- a/python/paddle/v2/framework/op.py +++ b/python/paddle/v2/framework/op.py @@ -89,7 +89,7 @@ class OpDescCreationMethod(object): new_attr.f = user_defined_attr elif attr.type == framework_pb2.STRING: new_attr.s = user_defined_attr - elif attr.type == framework_pb2.BOOL: + elif attr.type == framework_pb2.BOOLEAN: new_attr.b = user_defined_attr elif attr.type == framework_pb2.INTS: new_attr.ints.extend(user_defined_attr) @@ -97,7 +97,7 @@ class 
OpDescCreationMethod(object): new_attr.floats.extend(user_defined_attr) elif attr.type == framework_pb2.STRINGS: new_attr.strings.extend(user_defined_attr) - elif attr.type == framework_pb2.BOOLS: + elif attr.type == framework_pb2.BOOLEANS: new_attr.bools.extend(user_defined_attr) elif attr.type == framework_pb2.INT_PAIRS: for p in user_defined_attr: diff --git a/python/paddle/v2/framework/tests/test_dropout_op.py b/python/paddle/v2/framework/tests/test_dropout_op.py index 3638fee1a1..29fc702791 100644 --- a/python/paddle/v2/framework/tests/test_dropout_op.py +++ b/python/paddle/v2/framework/tests/test_dropout_op.py @@ -7,7 +7,7 @@ class TestDropoutOp(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 0.0, 'is_training': 1} + self.attrs = {'dropout_prob': 0.0, 'is_training': True} self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64))} def test_check_output(self): @@ -21,7 +21,7 @@ class TestDropoutOp2(TestDropoutOp): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 1.0, 'is_training': 1} + self.attrs = {'dropout_prob': 1.0, 'is_training': True} self.outputs = {'Out': np.zeros((32, 64)), 'Mask': np.zeros((32, 64))} @@ -29,7 +29,7 @@ class TestDropoutOp3(TestDropoutOp): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} - self.attrs = {'dropout_prob': 0.0, 'is_training': 1} + self.attrs = {'dropout_prob': 0.0, 'is_training': True} self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64, 2))} @@ -37,7 +37,7 @@ class TestDropoutOp4(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} - self.attrs = {'dropout_prob': 0.35, 'is_training': 0} + self.attrs = {'dropout_prob': 0.35, 'is_training': False} self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} def test_check_output(self): @@ -48,7 +48,7 @@ class TestDropoutOp5(OpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")} - self.attrs = {'dropout_prob': 0.75, 'is_training': 0} + self.attrs = {'dropout_prob': 0.75, 'is_training': False} self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} def test_check_output(self): From ece329100a691d92ea20c3e3240f8f1cc4ea955d Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 20 Sep 2017 17:13:49 +0800 Subject: [PATCH 28/50] refine rank_loss_op --- paddle/operators/rank_loss_op.cc | 77 +++++++++++------ paddle/operators/rank_loss_op.h | 86 ++++++++----------- .../v2/framework/tests/test_rank_loss_op.py | 27 +++--- 3 files changed, 104 insertions(+), 86 deletions(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 66571bd9a6..fd3ac86939 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -28,18 +28,21 @@ class RankLossOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { // input check - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("P"), "Input(P) shouldn't be null"); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oi"), "Input(Oi) shouldn't be null"); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oj"), "Input(Oj) shouldn't be null"); - auto p_dims = ctx.Input("P")->dims(); - auto oi_dims = ctx.Input("Oi")->dims(); - auto oj_dims = ctx.Input("Oj")->dims(); - 
PADDLE_ENFORCE_EQ(oi_dims, oj_dims, - "Input(Oi) and Input(Oj) must have the same size"); - PADDLE_ENFORCE_EQ( - p_dims, oi_dims, - "Input(P) must have the same size with Input(Oi) & Input(Oj)"); - ctx.Output("Out")->Resize(p_dims); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), + "Input(Label) shouldn't be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Left"), + "Input(Left) shouldn't be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Right"), + "Input(Right) shouldn't be null"); + auto label_dims = ctx.Input("Label")->dims(); + auto left_dims = ctx.Input("Left")->dims(); + auto right_dims = ctx.Input("Right")->dims(); + PADDLE_ENFORCE((label_dims.size() == 1) && (left_dims.size() == 1) && + (right_dims.size() == 1), + "The rank of all inputs must be 1."); + PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), + "All inputs must have the same size"); + ctx.Output("Out")->Resize(label_dims); } }; @@ -48,14 +51,23 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { RankLossOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("P", "The desired target values for posteriors."); - AddInput("Oi", "The model output for item i."); - AddInput("Oj", "The model output for item j."); - AddOutput("Out", "The output tensor of RankLoss operator."); + AddInput("Label", + "The label indicating A ranked higher than B or not, 1-D tensor."); + AddInput("Left", "The output of RankNet for doc A, 1-D tensor."); + AddInput("Right", "The output of RankNet for doc B, 1-D tensor"); + AddOutput("Out", "The output loss of RankLoss operator, 1-D tensor."); AddComment(R"DOC(RankLoss operator -A rank loss operator for learning to rank (LTR) task. This operator contains -three inputs: P, Oi, and Oj, and the rank cost can be expressed as +Rank loss operator for RankNet[1]. RankNet is a pairwise ranking model with +one training sample consisting of a pair of doc A and B, and the label P +indicating that A is ranked higher than B or not: + +P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of +the input pair. + +The RankLoss operator contains three inputs: Left (o_i), Right (o_j) and Label +(P_{i,j}), which represent the output of RankNet for two docs and the label +respectively, and yields the rank loss C_{i,j} by following the expression \f[ C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ @@ -63,10 +75,11 @@ three inputs: P, Oi, and Oj, and the rank cost can be expressed as \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} \f] -A detailed explanation about these notations can be found in +The operator can take inputs of one sample or in batch. [1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to - Rank useing Gradient Descent. + Rank using Gradient Descent. 
+ http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf )DOC"); } }; @@ -81,15 +94,25 @@ class RankLossGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("P"), "Input(P) shouldn't be null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oi"), "Input(Oi) shouldn't be null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Oj"), "Input(Oj) shouldn't be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), + "Input(Label) shouldn't be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Left"), + "Input(Left) shouldn't be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Right"), + "Input(Right) shouldn't be null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); - auto dims = ctx.Input("P")->dims(); - ctx.Output(framework::GradVarName("P"))->Resize(dims); - ctx.Output(framework::GradVarName("Oi"))->Resize(dims); - ctx.Output(framework::GradVarName("Oj"))->Resize(dims); + auto dims = ctx.Input("Left")->dims(); + auto *left_grad = + ctx.Output(framework::GradVarName("Left")); + auto *right_grad = + ctx.Output(framework::GradVarName("Right")); + if (left_grad) { + left_grad->Resize(dims); + } + if (right_grad) { + right_grad->Resize(dims); + } } }; diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index d21871107a..9776d123fe 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -24,25 +24,20 @@ template class RankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* out = ctx.Output("Out"); - auto* p_t = ctx.Input("P"); - auto* oi_t = ctx.Input("Oi"); - auto* oj_t = ctx.Input("Oj"); - out->mutable_data(ctx.GetPlace()); + auto* out_t = ctx.Output("Out"); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); + out_t->mutable_data(ctx.GetPlace()); - auto& dev = ctx.GetEigenDevice(); - auto out_eig = framework::EigenVector::Flatten(*out); - auto p_eig = framework::EigenVector::Flatten(*p_t); - auto oi_eig = framework::EigenVector::Flatten(*oi_t); - auto oj_eig = framework::EigenVector::Flatten(*oj_t); - - framework::Tensor o_t; - o_t.Resize(oi_t->dims()); - o_t.mutable_data(ctx.GetPlace()); - auto o_eig = framework::EigenVector::Flatten(o_t); - o_eig.device(dev) = oi_eig - oj_eig; + auto out = framework::EigenVector::Flatten(*out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); - out_eig.device(dev) = (1. + (o_eig).exp()).log() - p_eig * o_eig; + auto& dev = ctx.GetEigenDevice(); + out.device(dev) = + (1. 
+ (left - right).exp()).log() - label * (left - right); } }; @@ -50,40 +45,35 @@ template class RankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* d_oi = ctx.Output(framework::GradVarName("Oi")); - auto* d_oj = ctx.Output(framework::GradVarName("Oj")); - auto* d_p = ctx.Output(framework::GradVarName("P")); - - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* p_t = ctx.Input("P"); - auto* oi_t = ctx.Input("Oi"); - auto* oj_t = ctx.Input("Oj"); + auto* d_left_t = + ctx.Output(framework::GradVarName("Left")); + auto* d_right_t = + ctx.Output(framework::GradVarName("Right")); - d_oi->mutable_data(ctx.GetPlace()); - d_oj->mutable_data(ctx.GetPlace()); - d_p->mutable_data(ctx.GetPlace()); + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); auto& dev = ctx.GetEigenDevice(); - auto d_out_eig = framework::EigenVector::Flatten(*d_out); - auto p_eig = framework::EigenVector::Flatten(*p_t); - auto oi_eig = framework::EigenVector::Flatten(*oi_t); - auto oj_eig = framework::EigenVector::Flatten(*oj_t); - - auto d_oi_eig = framework::EigenVector::Flatten(*d_oi); - auto d_oj_eig = framework::EigenVector::Flatten(*d_oj); - - framework::Tensor o_t; - o_t.Resize(oi_t->dims()); - o_t.mutable_data(ctx.GetPlace()); - auto o_eig = framework::EigenVector::Flatten(o_t); - o_eig.device(dev) = oi_eig - oj_eig; - - // dOi & dOj - d_oi_eig.device(dev) = - d_out_eig * (o_eig.exp() / (1. + o_eig.exp()) - p_eig); - d_oj_eig.device(dev) = -d_oi_eig; - // dP - framework::EigenVector::Flatten(*d_p).device(dev) = -o_eig; + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); + + // compute d_left + if (d_left_t) { + d_left_t->mutable_data(ctx.GetPlace()); + auto d_left = framework::EigenVector::Flatten(*d_left_t); + d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label); + } + // compute d_right + if (d_right_t) { + d_right_t->mutable_data(ctx.GetPlace()); + auto d_right = framework::EigenVector::Flatten(*d_right_t); + d_right.device(dev) = + -d_out * (1.0 / (1. 
+ (right - left).exp()) - label); + } } }; } // namespace operators diff --git a/python/paddle/v2/framework/tests/test_rank_loss_op.py b/python/paddle/v2/framework/tests/test_rank_loss_op.py index 48354b7f7b..c4d74e1c04 100644 --- a/python/paddle/v2/framework/tests/test_rank_loss_op.py +++ b/python/paddle/v2/framework/tests/test_rank_loss_op.py @@ -3,24 +3,29 @@ import numpy as np from op_test import OpTest -class TestReshapeOp(OpTest): +class TestRankLossOp(OpTest): def setUp(self): self.op_type = "rank_loss" - num = 5 - # P = {0, 1.0} or {0, 0.5, 1.0} - P = np.random.randint(0, 2, size=(num, num)).astype("float32") - Oi = np.random.random((num, num)).astype("float32") - Oj = np.random.random((num, num)).astype("float32") - O = Oi - Oj - Out = np.log(1.0 + np.exp(O)) - P * O - self.inputs = {'P': P, 'Oi': Oi, 'Oj': Oj} - self.outputs = {'Out': Out} + batch_size = 5 + # labels_{i} = {0, 1.0} or {0, 0.5, 1.0} + label = np.random.randint(0, 2, size=(batch_size, )).astype("float32") + left = np.random.random((batch_size, )).astype("float32") + right = np.random.random((batch_size, )).astype("float32") + loss = np.log(1.0 + np.exp(left - right)) - label * (left - right) + self.inputs = {'Label': label, 'Left': left, 'Right': right} + self.outputs = {'Out': loss} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(["Oj"], "Out") + self.check_grad(["Left", "Right"], "Out") + + def test_check_grad_ignore_left(self): + self.check_grad(["Right"], "Out", no_grad_set=set('Left')) + + def test_check_grad_ignore_right(self): + self.check_grad(["Left"], "Out", no_grad_set=set('Right')) if __name__ == '__main__': From b65709e4039f338d90391b0fed9b8f6118b23380 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 19 Sep 2017 16:44:28 +0800 Subject: [PATCH 29/50] Share LoD between input and output of each opeators. 
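As a rough sketch of the pattern this change introduces (MyOp is a made-up name used only for illustration; the ShareLoD helper is the one added to InferShapeContext in paddle/framework/operator.h below):

    namespace paddle {
    namespace operators {

    // Hypothetical element-wise style operator: the output takes its shape
    // from the input, and now also inherits the input's LoD.
    class MyOp : public framework::OperatorWithKernel {
     public:
      using framework::OperatorWithKernel::OperatorWithKernel;

     protected:
      void InferShape(const framework::InferShapeContext &ctx) const override {
        PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null.");
        ctx.Output<framework::LoDTensor>("Out")->Resize(
            ctx.Input<framework::LoDTensor>("X")->dims());
        // Both "X" and "Out" must hold LoDTensor; "Out" now shares "X"'s LoD.
        ctx.ShareLoD("X", "Out");
      }
    };

    }  // namespace operators
    }  // namespace paddle

Operators for which LoD propagation from a single input does not make sense can simply leave the call out.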
--- paddle/framework/operator.h | 8 +++++++ paddle/operators/accuracy_op.cc | 7 ++++++- paddle/operators/cos_sim_op.cc | 10 ++++++--- paddle/operators/elementwise_mul_op.cc | 5 +++++ paddle/operators/fc_op.cc | 4 ++++ paddle/operators/fill_zeros_like_op.cc | 21 +++++++++---------- paddle/operators/fill_zeros_like_op.h | 2 +- paddle/operators/lookup_table_op.cc | 11 +++++++--- paddle/operators/mean_op.cc | 3 ++- paddle/operators/minus_op.cc | 8 ++++++- paddle/operators/mul_op.cc | 10 +++++++-- paddle/operators/onehot_cross_entropy_op.cc | 3 +++ paddle/operators/prelu_op.cc | 3 +++ paddle/operators/rowwise_add_op.cc | 1 + paddle/operators/scale_op.cc | 1 + paddle/operators/sigmoid_op.cc | 1 + paddle/operators/squared_l2_distance_op.cc | 4 ++++ paddle/operators/sum_op.cc | 8 +++++-- .../tests/test_fill_zeros_like_op.py | 4 ++-- 19 files changed, 87 insertions(+), 27 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index b7c9c39402..28a253ec0b 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -336,6 +336,14 @@ class InferShapeContext { return &var->Get(); } + void ShareLoD(const std::string& in, const std::string& out) const { + PADDLE_ENFORCE(InputVar(in)->IsType(), + "The Input(%s) must be LoDTensor.", in); + PADDLE_ENFORCE(OutputVar(out)->IsType(), + "The Output(%s) must be LoDTensor.", out); + Output(out)->set_lod(Input(in)->lod()); + } + private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 0c813748b2..32479ae5a3 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -40,6 +40,7 @@ class AccuracyOp : public framework::OperatorWithKernel { "inference size must be the same as label size"); ctx.Output("Accuracy")->Resize({1}); + ctx.ShareLoD("Inference", "Accuracy"); } }; @@ -58,7 +59,11 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { R"DOC(Accuracy. It will print accuracy rate for classification. The accuracy is: .. math:: -accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})DOC"); +accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples}) + +Both the input `Inference` and `Label` can carry the LoD (Level of Details) +information, or not. But the output only shares the LoD with input `Inference`. +DOC"); } }; diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 72c4464936..840848fa08 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -57,6 +57,7 @@ class CosSimOp : public framework::OperatorWithKernel { ctx.Output("Out")->Resize({x_dims[0], 1}); ctx.Output("XNorm")->Resize({x_dims[0], 1}); ctx.Output("YNorm")->Resize({y_dims[0], 1}); + ctx.ShareLoD("X", "Out"); } }; @@ -81,10 +82,13 @@ Cosine Similarity Operator. The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)). -Input(X) and Input(Y) must have the same shape, except that the 1st dimension -of Input(Y) could be just 1 (different from Input(X)), which will be -broadcasted to match the shape of Input(X) before computing their cosine +The input `X` and `Y` must have the same shape, except that the 1st dimension +of input `Y` could be just 1 (different from input `X`), which will be +broadcasted to match the shape of input `X` before computing their cosine similarity. + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. 
)DOC"); } }; diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index ee6e975b44..304e45fa5b 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -38,6 +38,7 @@ class ElementWiseMulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), "Rank of first input must >= rank of second input.") ctx.Output("Out")->Resize(x_dim); + ctx.ShareLoD("X", "Out"); } }; @@ -63,11 +64,15 @@ Limited elementwise multiple operator.The equation is: Out = X ⊙ Y. 2. Y's shape is a subset of X. Y will be broadcasted to match the shape of X and axis should be dimension index Y in X. example: + shape(X) = (2, 3, 4, 5), shape(Y) = (,) shape(X) = (2, 3, 4, 5), shape(Y) = (5,) shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Both the input X and Y can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input X. )DOC"); } }; diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index e5d0f3c372..56fe654d1e 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -186,6 +186,10 @@ W_i is a 2-D matrix of size (K x N), where N means the number of neurons in the fully connected layer. B is a 1-D vector of size N. Thus, the output Out is a 2-D matrix of size (M x N). Activation type can be set to `identity` (default), `sigmoid` or `softmax`. + +All the inputs can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with first input (`X[0]`). +)DOC"); )DOC"); } }; diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index ba7857cc65..a238b59b78 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -23,15 +23,14 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar("Src"), - "Input(Src) of FillZerosLikeOp should not be null."); - PADDLE_ENFORCE_NOT_NULL( - ctx.OutputVar("Dst"), - "Output(Dst) of FillZerosLikeOp should not be null."); - - ctx.Output("Dst")->Resize( - ctx.Input("Src")->dims()); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), + "Input(X) of FillZerosLikeOp should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), + "Output(Y) of FillZerosLikeOp should not be null."); + + ctx.Output("Y")->Resize( + ctx.Input("X")->dims()); + ctx.ShareLoD("X", "Y"); } }; @@ -40,8 +39,8 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { FillZerosLikeOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Src", "The input of fill-zeros-like op."); - AddOutput("Dst", "The varibale will be filled up with zeros."); + AddInput("X", "The input of fill-zeros-like op."); + AddOutput("Y", "The varibale will be filled up with zeros."); AddComment(R"DOC( Fill up a vriable with zeros. 
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 969998ce2e..4474581784 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -23,7 +23,7 @@ template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* output = context.Output("Dst"); + auto* output = context.Output("Y"); output->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*output); t.device(context.GetEigenDevice()) = t.constant(static_cast(0)); diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 07f6dfabca..8f533f1cc3 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -35,6 +35,7 @@ class LookupTableOp : public framework::OperatorWithKernel { auto output_t = ctx.Output("Out"); output_t->Resize({ids_t->dims()[0], table_t->dims()[1]}); + ctx.ShareLoD("Ids", "Out"); } }; @@ -50,9 +51,13 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "An input with type int32 or int64" "contains the ids to be looked up in W."); AddOutput("Out", "The lookup results, which have the same type with W."); - AddComment( - "This operator is used to perform lookups on the parameter W," - "then concatenated into a dense tensor."); + AddComment(R"DOC( +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input `Ids` can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD with input `Ids`. +)DOC"); } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index 7d7eeb59a2..96540ff454 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -37,7 +37,8 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of mean op"); AddOutput("Out", "The output of mean op").NotInGradient(); - AddComment("Mean Operator"); + AddComment(R"DOC( Mean Operator +)DOC"); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index a97bbecdca..5036f9f98a 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -41,6 +41,7 @@ class MinusOp : public framework::OperatorWithKernel { left_tensor->numel(), right_tensor->numel(), "Minus operator must take two tensor with same num of elements"); ctx.Output("Out")->Resize(left_tensor->dims()); + ctx.ShareLoD("X", "Out"); } }; @@ -54,7 +55,12 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC(Minus Operator -Equation: Out = X - Y +Equation: + + Out = X - Y + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index b6d320b415..b2409a1870 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -55,6 +55,7 @@ class MulOp : public framework::OperatorWithKernel { "First matrix's width must be equal with second matrix's height."); ctx.Output("Out")->Resize( {x_mat_dims[0], y_mat_dims[1]}); + ctx.ShareLoD("X", "Out"); } }; @@ -83,9 +84,14 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(1) .EqualGreaterThan(1); AddComment(R"DOC( -Two Element Mul Operator. +Mul operator is used to perform matrix multiplication for input X and Y. 
-The equation is: Out = X * Y +The equation is: + + Out = X * Y + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/onehot_cross_entropy_op.cc b/paddle/operators/onehot_cross_entropy_op.cc index f38be3549f..1d87032d27 100644 --- a/paddle/operators/onehot_cross_entropy_op.cc +++ b/paddle/operators/onehot_cross_entropy_op.cc @@ -40,6 +40,7 @@ class OnehotCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1."); PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]); ctx.Output("Y")->Resize({X->dims()[0], 1}); + ctx.ShareLoD("X", "Y"); } }; @@ -69,6 +70,8 @@ OnehotCrossEntropy Operator. Y[i] = -log(X[i][j]) +Both the input `X` and `Label` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 7ae80b2968..2b7b82a3e1 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -38,6 +38,7 @@ class PReluOp : public framework::OperatorWithKernel { "Output(Out) should not be null"); auto *out = ctx.Output("Out"); out->Resize(in->dims()); + ctx.ShareLoD("X", "Out"); } }; @@ -55,6 +56,8 @@ The equation is: f(x) = alpha * x , for x < 0 f(x) = x , for x >= 0 +The input `X` can carry the LoD (Level of Details) information, +or not. And the output shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 2a3fd3be94..90cdb2558b 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -45,6 +45,7 @@ class RowwiseAddOp : public framework::OperatorWithKernel { "The width of two operands must be same"); PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1"); ctx.Output("Out")->Resize(x_dims); + ctx.ShareLoD("X", "Out"); } }; diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index d1f42e8662..ca1bc4ac80 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -35,6 +35,7 @@ class ScaleOp : public framework::OperatorWithKernel { auto *in = ctx.Input("X"); auto *out = ctx.Output("Out"); out->Resize(in->dims()); + ctx.ShareLoD("X", "Out"); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 992b19965e..42befa22d0 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -30,6 +30,7 @@ class SigmoidOp : public framework::OperatorWithKernel { ctx.Output("Y")->Resize( ctx.Input("X")->dims()); + ctx.ShareLoD("X", "Y"); } }; diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index 39f4305877..dfe8e6decd 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -57,6 +57,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { ctx.Output("sub_result") ->Resize({x_dims[0], x->numel() / x_dims[0]}); ctx.Output("Out")->Resize({x_dims[0], 1}); + ctx.ShareLoD("X", "Out"); } }; @@ -79,6 +80,9 @@ class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { input or to 1. If the first dimension of target is 1, SquaredL2DistanceOp will broadcast target's first dimension to input's first dimension. You can decide whether calculate the gradient of input and target. 
+ + Both the input X and Y can carry the LoD (Level of Details) information, + or not. But the output only shares the LoD with input X. )DOC"); } }; diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 41e05c27f9..ebc57d6b7b 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -39,6 +39,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape"); } out->Resize(in_dim); + ctx.ShareLoD(ctx.op().Inputs("X")[0], "Out"); } }; @@ -49,8 +50,11 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "the input tensors of sum operator.").AsDuplicable(); AddOutput("Out", "the output tensor of sum operator."); AddComment(R"DOC( - Sum the input tensors. - )DOC"); +Sum the input tensors. + +All the inputs can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with the first input. +)DOC"); } }; diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py index 2473daaba2..eff8fa87d9 100644 --- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py +++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py @@ -6,8 +6,8 @@ from op_test import OpTest class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" - self.inputs = {'Src': np.random.random((219, 232)).astype("float32")} - self.outputs = {'Dst': np.zeros_like(self.inputs["Src"])} + self.inputs = {'X': np.random.random((219, 232)).astype("float32")} + self.outputs = {'Y': np.zeros_like(self.inputs["X"])} def test_check_output(self): self.check_output() From 3ec48480af191970ad12377c9161994920fbf722 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 20 Sep 2017 19:05:36 +0800 Subject: [PATCH 30/50] Fix bug --- paddle/operators/accuracy_op.cc | 6 +++--- paddle/operators/dropout_op.cc | 1 + paddle/operators/fc_op.cc | 1 - paddle/operators/pad_op.cc | 5 +++++ paddle/operators/reshape_op.cc | 5 +++++ 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 32479ae5a3..391258b40b 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -55,15 +55,15 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { // TODO(typhoonzero): AddInput("Weight", ... AddOutput("Accuracy", "The accuracy of current batch"); - AddComment( - R"DOC(Accuracy. It will print accuracy rate for classification. + AddComment(R"DOC( +Accuracy. It will print accuracy rate for classification. The accuracy is: .. math:: accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples}) Both the input `Inference` and `Label` can carry the LoD (Level of Details) information, or not. But the output only shares the LoD with input `Inference`. -DOC"); +)DOC"); } }; diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index b111b9fccb..bfa1992d79 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -38,6 +38,7 @@ class DropoutOp : public framework::OperatorWithKernel { if (ctx.Attr("is_training") == 1) { ctx.Output("Mask")->Resize(dims); } + ctx.ShareLoD("X", "Out"); } }; diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index 56fe654d1e..5ac0e8cc45 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -189,7 +189,6 @@ Activation type can be set to `identity` (default), `sigmoid` or `softmax`. 
All the inputs can carry the LoD (Level of Details) information, or not. But the output only shares the LoD with first input (`X[0]`). -)DOC"); )DOC"); } }; diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index a0b1c6b631..98de18fb9f 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -41,6 +41,11 @@ class PadOp : public framework::OperatorWithKernel { } ctx.Output("Out")->Resize( framework::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx.ShareLoD("X", "Out"); + } } }; diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index 0d05e34414..c090758619 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -51,6 +51,11 @@ class ReshapeOp : public framework::OperatorWithKernel { [](int a) { return static_cast(a); }); auto out_dims = framework::make_ddim(shape_int64); ctx.Output("Out")->Resize(out_dims); + if (shape[0] == in->dims()[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx.ShareLoD("X", "Out"); + } } }; From 9a3c6d5f9c50ba170337881953890c41fa9cf0f3 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 20 Sep 2017 17:56:08 -0700 Subject: [PATCH 31/50] Add BlockDesc and ProgramDesc to framework.proto --- paddle/framework/framework.proto | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index dfcb5fb621..dcbc9ec407 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -23,6 +23,7 @@ enum AttrType { FLOATS = 4; STRINGS = 5; INT_PAIRS = 6; + BLOCK = 7; } message IntPair { @@ -44,6 +45,7 @@ message OpDesc { repeated float floats = 7; repeated string strings = 8; repeated IntPair int_pairs = 9; + optional int32 block = 10; }; message Var { @@ -108,3 +110,12 @@ message VarDesc { required string name = 1; optional LoDTensorDesc lod_tensor = 2; } + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; +} + +message ProgramDesc { repeated BlockDesc blocks = 1; } From 68b5e5bf85ade89bffeec09a4e959dd11da8af67 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 21 Sep 2017 10:12:07 +0800 Subject: [PATCH 32/50] Use stridecpy instead of CUDA kernel --- paddle/operators/crop_op.cc | 50 +-------------- paddle/operators/crop_op.cu | 121 +----------------------------------- paddle/operators/crop_op.h | 53 +++++++++++++--- 3 files changed, 45 insertions(+), 179 deletions(-) diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index ee4bc9cdaf..d38c7ba358 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -128,59 +128,11 @@ class CropOpGrad : public framework::OperatorWithKernel { } }; -int64_t transIndex(std::vector out_shape, std::vector x_shape, - std::vector> crop_rules, size_t index) { - int64_t dim_size = out_shape.size(); - std::vector pos(dim_size); - - for (int64_t i = out_shape.size() - 1; i >= 0; --i) { - pos[i] = (index % out_shape[i]) + crop_rules[i].first; - index = index / out_shape[i]; - } - - size_t result = pos[0]; - for (size_t i = 1; i < x_shape.size(); ++i) { - result = result * x_shape[i] + pos[i]; - } - return result; -} - -template -class CropCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto 
*out = context.Output("Out"); - auto x_data = x->data(); - T *out_data = out->mutable_data(context.GetPlace()); - auto x_dims = x->dims(); - auto out_dims = out->dims(); - int64_t out_count = out->numel(); - std::vector x_shape = framework::vectorize(x_dims); - std::vector out_shape = framework::vectorize(out_dims); - - auto offsets = context.Attr>("offsets"); - PADDLE_ENFORCE_EQ( - x_dims.size(), offsets.size(), - "Offsets size should be equal to dimension size of input tensor."); - - std::vector> crop_rules(x_dims.size()); - for (size_t i = 0; i < crop_rules.size(); ++i) { - crop_rules[i].first = offsets[i]; - crop_rules[i].second = x_dims[i] - out_dims[i] - offsets[i]; - } - - for (int64_t i = 0; i < out_count; ++i) { - out_data[i] = x_data[transIndex(out_shape, x_shape, crop_rules, i)]; - } - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); -REGISTER_OP_CPU_KERNEL(crop, ops::CropCPUKernel); +REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); REGISTER_OP_CPU_KERNEL(crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index 05782145b8..f8ee18a1d6 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -13,128 +13,9 @@ limitations under the License. */ #define EIGEN_USE_GPU -#include #include "paddle/operators/crop_op.h" -namespace paddle { -namespace operators { - -using framework::LoDTensor; -using framework::Tensor; - -template -__global__ void CropKernel(const int N, const int64_t* out_shape, - const int64_t* x_shape, const int* crop_rules, - const T* x_data, T* out_data) { - int64_t pos[D]; - int tmp; - int64_t x_index; - for (int out_index = blockIdx.x * blockDim.x + threadIdx.x; out_index < N; - out_index += blockDim.x * gridDim.x) { - tmp = out_index; - for (int64_t i = D - 1; i >= 0; --i) { - pos[i] = (tmp % out_shape[i]) + crop_rules[i * 2]; - tmp = tmp / out_shape[i]; - } - - x_index = pos[0]; - for (size_t i = 1; i < D; ++i) { - x_index = x_index * x_shape[i] + pos[i]; - } - out_data[out_index] = x_data[x_index]; - } -} - -template -void CropCUDAFunctoin(const framework::ExecutionContext& context) { - PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), - "It must use GPUPlace."); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto x_data = x->data(); - T* out_data = out->mutable_data(paddle::platform::GPUPlace()); - auto x_dims = x->dims(); - auto out_dims = out->dims(); - int64_t out_count = out->numel(); - Tensor x_shape; - Tensor out_shape; - int64_t* x_shape_data = - x_shape.mutable_data({D}, paddle::platform::CPUPlace()); - int64_t* out_shape_data = - out_shape.mutable_data({D}, paddle::platform::CPUPlace()); - for (int i = 0; i < D; ++i) { - x_shape_data[i] = x_dims[i]; - out_shape_data[i] = out_dims[i]; - } - Tensor x_shape_gpu; - Tensor out_shape_gpu; - x_shape_gpu.CopyFrom(x_shape, paddle::platform::GPUPlace()); - out_shape_gpu.CopyFrom(out_shape, paddle::platform::GPUPlace()); - auto offsets = context.op().Attr>("offsets"); - PADDLE_ENFORCE_EQ( - D, offsets.size(), - "Offsets size should be equal to dimension size of input tensor."); - - Tensor crop_rules; - int* crop_rules_data = - crop_rules.mutable_data({D * 2}, paddle::platform::CPUPlace()); - for (size_t i = 0; i < D; ++i) { - crop_rules_data[i * 2] = offsets[i]; - crop_rules_data[i * 2 + 1] = x_dims[i] - out_dims[i] - offsets[i]; - } - - Tensor crop_rules_gpu; - 
crop_rules_gpu.CopyFrom(crop_rules, paddle::platform::GPUPlace()); - - int n = out_dims[0]; - int d = out_dims[1]; - int block = 512; - int grid = (n * d + block - 1) / block; - - CropKernel< - T, - D><<( - context.device_context()) - .stream()>>>( - out_count, out_shape_gpu.data(), x_shape_gpu.data(), - crop_rules_gpu.data(), x_data, out_data); -} - -template -class CropOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - size_t rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - CropCUDAFunctoin(context); - break; - case 2: - CropCUDAFunctoin(context); - break; - case 3: - CropCUDAFunctoin(context); - break; - case 4: - CropCUDAFunctoin(context); - break; - case 5: - CropCUDAFunctoin(context); - break; - case 6: - CropCUDAFunctoin(context); - break; - default: - PADDLE_THROW( - "CropOp only support tensors with no more than 6 dimensions."); - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(crop, ops::CropOpCUDAKernel); +REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel); REGISTER_OP_GPU_KERNEL(crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 09d42f4b7e..d4c523cf30 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -16,6 +16,7 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/strided_memcpy.h" namespace paddle { namespace operators { // Internal @@ -24,26 +25,58 @@ template using EigenTensor = framework::EigenTensor; -using framework::LoDTensor; +using framework::Tensor; +using framework::DDim; + +// TODO(wanghaoshuang): move this function to other place +DDim stride(const DDim& ddim) { + std::vector strides(ddim.size()); + strides[ddim.size() - 1] = 1; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i + 1]; + } + return make_ddim(strides); +} + +template +class CropKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + auto x_stride = stride(x->dims()); + auto out_stride = stride(out->dims()); + auto offsets = context.Attr>("offsets"); + PADDLE_ENFORCE_EQ( + x_dims.size(), offsets.size(), + "Offsets size should be equal to dimension size of input tensor."); + int64_t offset = 0; + for (int i = 0; i < offsets.size(); ++i) { + offset += (x_stride[i] * offsets[i]); + } + StridedMemcpy(context.device_context(), x_data + offset, x_stride, + out->dims(), out_stride, out_data); + } +}; template void CropGradFunction(const framework::ExecutionContext& context) { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_x = context.Output(framework::GradVarName("X")); if (d_x != nullptr) { + auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(context.GetPlace()); - auto d_x_dims = d_x->dims(); - auto d_out_dims = d_out->dims(); - auto offsets = context.op().Attr>("offsets"); + auto offsets = context.Attr>("offsets"); Eigen::array, D> paddings; - for (int i = 0; i < d_out_dims.size(); ++i) { + for (int i = 0; i < D; ++i) { paddings[i].first = offsets[i]; paddings[i].second = d_x_dims[i] - d_out_dims[i] - offsets[i]; } auto d_x_tensor = 
EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - auto place = context.GetEigenDevice(); - d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); + d_x_tensor.device(context.GetEigenDevice()) = + d_out_tensor.pad(paddings, 0); } } @@ -52,7 +85,7 @@ class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { size_t rank = - context.Input(framework::GradVarName("Out"))->dims().size(); + context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: CropGradFunction(context); From 36aeb30d12b2cdb7a763b59c70bc427eec7a49e3 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 21 Sep 2017 11:07:56 +0800 Subject: [PATCH 33/50] Remove LoDTensor in some operators' InferShape and refine ShareLoD function. --- paddle/framework/operator.cc | 15 +++---- paddle/framework/operator.h | 51 +++++++--------------- paddle/operators/accuracy_op.cc | 4 +- paddle/operators/add_op.cc | 2 +- paddle/operators/concat_op.cc | 2 +- paddle/operators/cos_sim_op.cc | 14 +++--- paddle/operators/cross_entropy_op.cc | 2 +- paddle/operators/dropout_op.cc | 2 +- paddle/operators/elementwise_mul_op.cc | 10 ++--- paddle/operators/fill_zeros_like_op.cc | 4 +- paddle/operators/gather_op.cc | 4 +- paddle/operators/gaussian_random_op.cc | 2 +- paddle/operators/lookup_table_op.cc | 6 +-- paddle/operators/mean_op.cc | 4 +- paddle/operators/minus_op.cc | 4 +- paddle/operators/mul_op.cc | 10 ++--- paddle/operators/pad_op.cc | 6 +-- paddle/operators/prelu_op.cc | 8 ++-- paddle/operators/reshape_op.cc | 6 +-- paddle/operators/rowwise_add_op.cc | 8 ++-- paddle/operators/scale_op.cc | 4 +- paddle/operators/scatter_op.cc | 7 ++- paddle/operators/sequence_avg_pool_op.cc | 5 +-- paddle/operators/sgd_op.cc | 2 +- paddle/operators/sigmoid_op.cc | 7 ++- paddle/operators/softmax_op.cc | 5 +-- paddle/operators/split_op.cc | 2 +- paddle/operators/squared_l2_distance_op.cc | 12 +++-- paddle/operators/sum_op.cc | 6 +-- paddle/operators/top_k_op.cc | 4 +- paddle/operators/uniform_random_op.cc | 2 +- 31 files changed, 93 insertions(+), 127 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index f8a64a7866..fdc0660837 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -207,23 +207,22 @@ const std::vector InferShapeContext::MultiInput( } template <> -Tensor* ExecutionContext::Output(const std::string& name) const { - auto* var = OutputVar(name); - return var == nullptr ? nullptr : const_cast(GetTensorFromVar(var)); +Tensor* InferShapeContext::Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); } template <> -std::vector ExecutionContext::MultiOutput( +std::vector InferShapeContext::MultiOutput( const std::string& name) const { auto names = op().Outputs(name); std::vector res; res.reserve(names.size()); std::transform(names.begin(), names.end(), std::back_inserter(res), [&](const std::string& sub_name) { - auto var = scope().FindVar(sub_name); - return var == nullptr - ? nullptr - : const_cast(GetTensorFromVar(var)); + auto var = scope_.FindVar(sub_name); + return var == nullptr ? 
nullptr + : var->GetMutable(); }); return res; } diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 28a253ec0b..4a078258d2 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -277,9 +277,9 @@ class InferShapeContext { return res; } - std::vector MultiOutputVar(const std::string& name) const { + std::vector MultiOutputVar(const std::string& name) const { auto names = op_.Outputs(name); - std::vector res; + std::vector res; res.reserve(names.size()); std::transform(names.begin(), names.end(), std::back_inserter(res), [this](const std::string& name) { @@ -336,12 +336,19 @@ class InferShapeContext { return &var->Get(); } - void ShareLoD(const std::string& in, const std::string& out) const { - PADDLE_ENFORCE(InputVar(in)->IsType(), - "The Input(%s) must be LoDTensor.", in); - PADDLE_ENFORCE(OutputVar(out)->IsType(), - "The Output(%s) must be LoDTensor.", out); - Output(out)->set_lod(Input(in)->lod()); + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, InputSize(in)); + PADDLE_ENFORCE_LT(j, OutputSize(out)); + auto* in_var = MultiInputVar(in)[i]; + auto* out_var = MultiOutputVar(out)[j]; + PADDLE_ENFORCE(in_var->IsType(), + "The %d-th input of Input(%s) must be LoDTensor.", in); + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); } private: @@ -388,38 +395,10 @@ class ExecutionContext : public InferShapeContext { return device_context_; } - // redefine Output function, - // use Variable::Get instead of Variable::GetMutable - template - T* Output(const std::string& name) const { - auto var = OutputVar(name); - return var == nullptr ? nullptr : const_cast(&var->Get()); - } - - // redefine MultiOutput function. 
- // use Variable::Get instead of Variable::GetMutable - template - std::vector MultiOutput(const std::string& name) const { - auto names = op().Outputs(name); - std::vector res; - res.reserve(names.size()); - std::transform( - names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) { return Output(sub_name); }); - return res; - } - private: const platform::DeviceContext& device_context_; }; -template <> -Tensor* ExecutionContext::Output(const std::string& name) const; - -template <> -std::vector ExecutionContext::MultiOutput( - const std::string& name) const; - class OpKernel { public: /** diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 391258b40b..70e4f9da12 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -39,8 +39,8 @@ class AccuracyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(inference->dims()[0], label->dims()[0], "inference size must be the same as label size"); - ctx.Output("Accuracy")->Resize({1}); - ctx.ShareLoD("Inference", "Accuracy"); + ctx.Output("Accuracy")->Resize({1}); + ctx.ShareLoD("Inference", /*->*/ "Accuracy"); } }; diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index e83c1efeaf..ed11d09697 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -33,7 +33,7 @@ class AddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), ctx.Input("Y")->dims(), "Two input of Add Op's dimension must be same."); - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 223bb0ffe6..07f847079e 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -29,7 +29,7 @@ class ConcatOp : public framework::OperatorWithKernel { "Output(Out) of ConcatOp should not be null."); auto ins = ctx.MultiInput("X"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); size_t axis = static_cast(ctx.Attr("axis")); size_t n = ins.size(); diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 840848fa08..b56ee2047b 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -54,10 +54,10 @@ class CosSimOp : public framework::OperatorWithKernel { " just 1 (which will be broadcasted to match Input(X))."); // resize tensor - ctx.Output("Out")->Resize({x_dims[0], 1}); - ctx.Output("XNorm")->Resize({x_dims[0], 1}); - ctx.Output("YNorm")->Resize({y_dims[0], 1}); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize({x_dims[0], 1}); + ctx.Output("XNorm")->Resize({x_dims[0], 1}); + ctx.Output("YNorm")->Resize({y_dims[0], 1}); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -143,10 +143,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { "Shape of Input(Out@Grad) must be [X.Dim(0), 1]."); // resize tensor - auto *x_grad = - ctx.Output(framework::GradVarName("X")); - auto *y_grad = - ctx.Output(framework::GradVarName("Y")); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); if (x_grad) x_grad->Resize(x_dims); if (y_grad) y_grad->Resize(y_dims); } diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 5de8f1489d..fd91d39d5f 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -52,7 +52,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { } 
ctx.Output("Y")->Resize({x->dims()[0], 1}); - ctx.ShareLoD("X", "Y"); + ctx.ShareLoD("X", /*->*/ "Y"); } }; diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index bfa1992d79..dc773e510e 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -38,7 +38,7 @@ class DropoutOp : public framework::OperatorWithKernel { if (ctx.Attr("is_training") == 1) { ctx.Output("Mask")->Resize(dims); } - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index 304e45fa5b..02bd4c7b85 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -37,8 +37,8 @@ class ElementWiseMulOp : public framework::OperatorWithKernel { auto y_dim = ctx.Input("Y")->dims(); PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), "Rank of first input must >= rank of second input.") - ctx.Output("Out")->Resize(x_dim); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize(x_dim); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -91,10 +91,8 @@ class ElementWiseMulOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx.Input("X")->dims(); auto y_dims = ctx.Input("Y")->dims(); auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); - auto *x_grad = - ctx.Output(framework::GradVarName("X")); - auto *y_grad = - ctx.Output(framework::GradVarName("Y")); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input.") diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index a238b59b78..761a527a55 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -28,9 +28,9 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) of FillZerosLikeOp should not be null."); - ctx.Output("Y")->Resize( + ctx.Output("Y")->Resize( ctx.Input("X")->dims()); - ctx.ShareLoD("X", "Y"); + ctx.ShareLoD("X", /*->*/ "Y"); } }; diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index d445b61c16..fecd1ce214 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -35,7 +35,7 @@ class GatherOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0"); framework::DDim output_dims(ctx.Input("X")->dims()); output_dims[0] = batch_size; - ctx.Output("Out")->Resize(output_dims); + ctx.Output("Out")->Resize(output_dims); } }; @@ -45,7 +45,7 @@ class GatherGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - auto X_grad = ctx.Output(framework::GradVarName("X")); + auto X_grad = ctx.Output(framework::GradVarName("X")); auto X = ctx.Input("X"); X_grad->Resize(X->dims()); diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index c0e161bbc0..5b7cbb5cc7 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -48,7 +48,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ctx.OutputVar("Out"), "Output(Out) of GaussianRandomOp should not be null."); - auto* tensor = ctx.Output("Out"); + auto* tensor = ctx.Output("Out"); auto dims = Attr>("dims"); std::vector temp; temp.reserve(dims.size()); diff --git 
a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 8f533f1cc3..04ac24662e 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -32,10 +32,10 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_t = ctx.Input("W"); auto ids_t = ctx.Input("Ids"); - auto output_t = ctx.Output("Out"); + auto output_t = ctx.Output("Out"); output_t->Resize({ids_t->dims()[0], table_t->dims()[1]}); - ctx.ShareLoD("Ids", "Out"); + ctx.ShareLoD("Ids", /*->*/ "Out"); } }; @@ -69,7 +69,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { void InferShape(const framework::InferShapeContext &context) const override { auto table = context.Input("W"); auto d_table = - context.Output(framework::GradVarName("W")); + context.Output(framework::GradVarName("W")); d_table->Resize(table->dims()); } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index 96540ff454..b04384bda8 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -27,7 +27,7 @@ class MeanOp : public framework::OperatorWithKernel { "Input(X) of MeanOp should not be null."); PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"), "Output(Out) of MeanOp should not be null."); - ctx.Output("Out")->Resize({1}); + ctx.Output("Out")->Resize({1}); } }; @@ -48,7 +48,7 @@ class MeanGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - ctx.Output(framework::GradVarName("X")) + ctx.Output(framework::GradVarName("X")) ->Resize(ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index 5036f9f98a..29cb85489b 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -40,8 +40,8 @@ class MinusOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( left_tensor->numel(), right_tensor->numel(), "Minus operator must take two tensor with same num of elements"); - ctx.Output("Out")->Resize(left_tensor->dims()); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize(left_tensor->dims()); + ctx.ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index b2409a1870..5303a31501 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -53,9 +53,9 @@ class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_mat_dims[1], y_mat_dims[0], "First matrix's width must be equal with second matrix's height."); - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( {x_mat_dims[0], y_mat_dims[1]}); - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -109,10 +109,8 @@ class MulOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx.Input("X")->dims(); auto y_dims = ctx.Input("Y")->dims(); auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); - auto *x_grad = - ctx.Output(framework::GradVarName("X")); - auto *y_grad = - ctx.Output(framework::GradVarName("Y")); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); auto x_mat_dims = framework::flatten_to_2d(x_dims, Attr("x_num_col_dims")); diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index 98de18fb9f..375d8a35ac 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -39,12 +39,12 @@ class PadOp : public framework::OperatorWithKernel { for (int i = 0; i < x_dim.size(); ++i) { out_dims[i] = x_dim[i] + paddings[i * 2] + 
paddings[i * 2 + 1]; } - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( framework::make_ddim(out_dims)); if (out_dims[0] == x_dim[0]) { // Only pass LoD when the first dimension is equal between // output and input. - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } } }; @@ -106,7 +106,7 @@ class PadOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); auto x_dims = ctx.Input("X")->dims(); - auto *x_g = ctx.Output(framework::GradVarName("X")); + auto *x_g = ctx.Output(framework::GradVarName("X")); if (x_g != nullptr) { x_g->Resize(x_dims); } diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 2b7b82a3e1..912196c190 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -36,9 +36,9 @@ class PReluOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"), "Output(Out) should not be null"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); out->Resize(in->dims()); - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -72,11 +72,11 @@ class PReluGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); - auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dx = ctx.Output(framework::GradVarName("X")); auto *x = ctx.Input("X"); auto *dalpha = - ctx.Output(framework::GradVarName("Alpha")); + ctx.Output(framework::GradVarName("Alpha")); auto *alpha = ctx.Input("Alpha"); dx->Resize(x->dims()); diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index c090758619..ddb93007e2 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -50,11 +50,11 @@ class ReshapeOp : public framework::OperatorWithKernel { std::transform(shape.begin(), shape.end(), shape_int64.begin(), [](int a) { return static_cast(a); }); auto out_dims = framework::make_ddim(shape_int64); - ctx.Output("Out")->Resize(out_dims); + ctx.Output("Out")->Resize(out_dims); if (shape[0] == in->dims()[0]) { // Only pass LoD when the first dimension is equal between // output and input. 
- ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } } }; @@ -99,7 +99,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); auto dims = ctx.Input("X")->dims(); - auto *d_in = ctx.Output(framework::GradVarName("X")); + auto *d_in = ctx.Output(framework::GradVarName("X")); d_in->Resize(dims); } }; diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 90cdb2558b..fc3ad721f2 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -44,8 +44,8 @@ class RowwiseAddOp : public framework::OperatorWithKernel { framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims, "The width of two operands must be same"); PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1"); - ctx.Output("Out")->Resize(x_dims); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize(x_dims); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -84,8 +84,8 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims, "The width of two operands must be same"); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *db = ctx.Output(framework::GradVarName("b")); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *db = ctx.Output(framework::GradVarName("b")); if (dx) dx->Resize(x_dims); if (db) db->Resize(b_dims); } diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index ca1bc4ac80..3940037c37 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -33,9 +33,9 @@ class ScaleOp : public framework::OperatorWithKernel { "Output(Out) of ScaleOp should not be null."); auto *in = ctx.Input("X"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); out->Resize(in->dims()); - ctx.ShareLoD("X", "Out"); + // ctx.ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 8820262732..3f02081a06 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -44,7 +44,7 @@ class ScatterOp : public framework::OperatorWithKernel { framework::DDim data_dim(ctx.Input("Updates")->dims()); for (int i = 1; i < data_dim.size(); ++i) PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input("Updates")->dims()[i]); - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( ctx.Input("Ref")->dims()); } }; @@ -56,10 +56,9 @@ class ScatterGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { auto *dUpdates = - ctx.Output(framework::GradVarName("Updates")); + ctx.Output(framework::GradVarName("Updates")); auto *Updates = ctx.Input("Updates"); - auto *dRef = - ctx.Output(framework::GradVarName("Ref")); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); auto *Ref = ctx.Input("Ref"); dRef->Resize(Ref->dims()); diff --git a/paddle/operators/sequence_avg_pool_op.cc b/paddle/operators/sequence_avg_pool_op.cc index 9815b8f3a8..11d42ac44e 100644 --- a/paddle/operators/sequence_avg_pool_op.cc +++ b/paddle/operators/sequence_avg_pool_op.cc @@ -38,7 +38,7 @@ class SequenceAvgPoolOp : public framework::OperatorWithKernel { /*batch size = */ static_cast(lod[0].size() - 1), "The first dimension of Input(X) must be large than batch size."); dims[0] = lod[0].size() - 1; - ctx.Output("Out")->Resize({dims}); + ctx.Output("Out")->Resize({dims}); } }; 
@@ -74,8 +74,7 @@ class SequenceAvgPoolGradOp : public framework::OperatorWithKernel { for (int64_t i = 1; i < og_dims.size(); ++i) { PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); } - auto* x_grad = - ctx.Output(framework::GradVarName("X")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); x_grad->Resize(x_dims); } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 1232e64c7f..b063e24272 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -33,7 +33,7 @@ class SGDOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx.Input("param")->dims(), ctx.Input("grad")->dims(), "Two input of SGD Op's dimension must be same."); - ctx.Output("param_out") + ctx.Output("param_out") ->Resize(ctx.Input("param")->dims()); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 42befa22d0..d2a38d1ebe 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -28,9 +28,8 @@ class SigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) of SigmoidOp should not be null."); - ctx.Output("Y")->Resize( - ctx.Input("X")->dims()); - ctx.ShareLoD("X", "Y"); + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); + ctx.ShareLoD("X", /*->*/ "Y"); } }; @@ -51,7 +50,7 @@ class SigmoidOpGrad : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - ctx.Output(framework::GradVarName("X")) + ctx.Output(framework::GradVarName("X")) ->Resize(ctx.Input("Y")->dims()); } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index c67eb028c8..e15cfe4850 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -30,8 +30,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx.Input("X")->dims().size() == 2UL, "The input of softmax op must be a matrix."); - ctx.Output("Y")->Resize( - ctx.Input("X")->dims()); + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); } }; @@ -77,7 +76,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx.Input(framework::GradVarName("Y"))->dims(), "Input(Y) and its gradients should have a same shape."); - ctx.Output(framework::GradVarName("X")) + ctx.Output(framework::GradVarName("X")) ->Resize(ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc index 61296f5c81..a9d35b4fb7 100644 --- a/paddle/operators/split_op.cc +++ b/paddle/operators/split_op.cc @@ -27,7 +27,7 @@ class SplitOp : public framework::OperatorWithKernel { void InferShape(const framework::InferShapeContext &ctx) const override { // infershape auto *in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); + auto outs = ctx.MultiOutput("Out"); size_t axis = static_cast(ctx.Attr("axis")); size_t num = static_cast(ctx.Attr("num")); std::vector sections = diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index dfe8e6decd..33a564b05b 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -54,10 +54,10 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { "First dimension of target must be equal to input " "or to 1."); - ctx.Output("sub_result") + ctx.Output("sub_result") ->Resize({x_dims[0], x->numel() / x_dims[0]}); - ctx.Output("Out")->Resize({x_dims[0], 1}); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize({x_dims[0], 
1}); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -104,10 +104,8 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(out_dims[1], 1, "Second dimension of output gradient " "must be 1."); - auto* x_grad = - ctx.Output(framework::GradVarName("X")); - auto* y_grad = - ctx.Output(framework::GradVarName("Y")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + auto* y_grad = ctx.Output(framework::GradVarName("Y")); if (x_grad) x_grad->Resize(x_dims); if (y_grad) y_grad->Resize(y_dims); } diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index ebc57d6b7b..437fc262f3 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -28,7 +28,7 @@ class SumOp : public framework::OperatorWithKernel { "Output(Out) of SumOp should not be null."); auto ins = ctx.MultiInput("X"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); int N = ins.size(); auto in_dim = ins[0]->dims(); @@ -39,7 +39,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape"); } out->Resize(in_dim); - ctx.ShareLoD(ctx.op().Inputs("X")[0], "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -65,7 +65,7 @@ class SumGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { auto outputs = - ctx.MultiOutput(framework::GradVarName("X")); + ctx.MultiOutput(framework::GradVarName("X")); auto dims = ctx.Input(framework::GradVarName("Out"))->dims(); for (auto output : outputs) { output->Resize(dims); diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc index 169b815fef..a6e43964e9 100644 --- a/paddle/operators/top_k_op.cc +++ b/paddle/operators/top_k_op.cc @@ -40,8 +40,8 @@ class TopkOp : public framework::OperatorWithKernel { framework::DDim dims = input->dims(); dims[dims.size() - 1] = k; - ctx.Output("Out")->Resize(dims); - ctx.Output("Indices")->Resize(dims); + ctx.Output("Out")->Resize(dims); + ctx.Output("Indices")->Resize(dims); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 184bcbc29c..17ea48361b 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -54,7 +54,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(Attr("min") < Attr("max"), "uniform_random's min must less then max"); - auto* tensor = ctx.Output("Out"); + auto* tensor = ctx.Output("Out"); auto dims = Attr>("dims"); std::vector temp; temp.reserve(dims.size()); From 8b6fda6fd8e3023ab96475253b08526006babb45 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 21 Sep 2017 11:28:33 +0800 Subject: [PATCH 34/50] move stride function to ddim.h --- paddle/framework/ddim.cc | 8 ++++++++ paddle/framework/ddim.h | 1 + paddle/operators/crop_op.cc | 11 ++++++----- paddle/operators/crop_op.h | 22 +++++----------------- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index fc3d508553..a335786753 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -292,5 +292,13 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } +DDim stride(const DDim& ddim) { + std::vector strides(ddim.size()); + strides[ddim.size() - 1] = 1; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i + 1]; + } + return 
framework::make_ddim(strides); +} } // namespace framework } // namespace paddle diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index ca29e7e8c7..4a871bb0a9 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -121,6 +121,7 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims); DDim flatten_to_1d(const DDim& src); +DDim stride(const DDim& ddim); } // namespace framework } // namespace paddle diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index d38c7ba358..7ed21f336f 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -32,8 +32,9 @@ class CropOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"), "Output(Out) of CropOp should not be null."); auto x_dim = ctx.Input("X")->dims(); - auto Y = ctx.Input("Y"); - if (Y == nullptr) { + auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + if (y == nullptr) { auto shape = Attr>("shape"); PADDLE_ENFORCE_EQ( int64_t(shape.size()), x_dim.size(), @@ -42,12 +43,12 @@ class CropOp : public framework::OperatorWithKernel { for (size_t i = 0; i < shape.size(); ++i) { tensor_shape[i] = static_cast(shape[i]); } - ctx.Output("Out")->Resize(framework::make_ddim(tensor_shape)); + out->Resize(framework::make_ddim(tensor_shape)); } else { - PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(Y->dims()), + PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y->dims()), "Tensor rank of both CropOp's " "inputs must be same."); - ctx.Output("Out")->Resize(Y->dims()); + out->Resize(y->dims()); } } }; diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index d4c523cf30..2f40c05903 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -24,19 +24,7 @@ namespace operators { // Internal template using EigenTensor = framework::EigenTensor; - using framework::Tensor; -using framework::DDim; - -// TODO(wanghaoshuang): move this function to other place -DDim stride(const DDim& ddim) { - std::vector strides(ddim.size()); - strides[ddim.size() - 1] = 1; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i + 1]; - } - return make_ddim(strides); -} template class CropKernel : public framework::OpKernel { @@ -44,13 +32,13 @@ class CropKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); auto* out = context.Output("Out"); - T* x_data = x->data(); + const T* x_data = x->data(); T* out_data = out->mutable_data(context.GetPlace()); - auto x_stride = stride(x->dims()); - auto out_stride = stride(out->dims()); + auto x_stride = framework::stride(x->dims()); + auto out_stride = framework::stride(out->dims()); auto offsets = context.Attr>("offsets"); PADDLE_ENFORCE_EQ( - x_dims.size(), offsets.size(), + x->dims().size(), offsets.size(), "Offsets size should be equal to dimension size of input tensor."); int64_t offset = 0; for (int i = 0; i < offsets.size(); ++i) { @@ -71,7 +59,7 @@ void CropGradFunction(const framework::ExecutionContext& context) { Eigen::array, D> paddings; for (int i = 0; i < D; ++i) { paddings[i].first = offsets[i]; - paddings[i].second = d_x_dims[i] - d_out_dims[i] - offsets[i]; + paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); From cf4b2db7584536fcb03d08a29a424a1bf4c5dbfc Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 21 Sep 2017 11:39:42 +0800 Subject: 
[PATCH 35/50] change the dims of input of rank_loss_op --- paddle/operators/rank_loss_op.cc | 14 ++++++-------- .../paddle/v2/framework/tests/test_rank_loss_op.py | 6 +++--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index fd3ac86939..d98fd54f22 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -1,4 +1,3 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,11 +36,10 @@ class RankLossOp : public framework::OperatorWithKernel { auto label_dims = ctx.Input("Label")->dims(); auto left_dims = ctx.Input("Left")->dims(); auto right_dims = ctx.Input("Right")->dims(); - PADDLE_ENFORCE((label_dims.size() == 1) && (left_dims.size() == 1) && - (right_dims.size() == 1), - "The rank of all inputs must be 1."); PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), "All inputs must have the same size"); + PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be row vector with size batch_sizex1."); ctx.Output("Out")->Resize(label_dims); } }; @@ -52,10 +50,10 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Label", - "The label indicating A ranked higher than B or not, 1-D tensor."); - AddInput("Left", "The output of RankNet for doc A, 1-D tensor."); - AddInput("Right", "The output of RankNet for doc B, 1-D tensor"); - AddOutput("Out", "The output loss of RankLoss operator, 1-D tensor."); + "The label indicating A ranked higher than B or not, row vector."); + AddInput("Left", "The output of RankNet for doc A, vector."); + AddInput("Right", "The output of RankNet for doc B, vetor"); + AddOutput("Out", "The output loss of RankLoss operator, vector."); AddComment(R"DOC(RankLoss operator Rank loss operator for RankNet[1]. 
RankNet is a pairwise ranking model with diff --git a/python/paddle/v2/framework/tests/test_rank_loss_op.py b/python/paddle/v2/framework/tests/test_rank_loss_op.py index c4d74e1c04..0e41ab1b3f 100644 --- a/python/paddle/v2/framework/tests/test_rank_loss_op.py +++ b/python/paddle/v2/framework/tests/test_rank_loss_op.py @@ -8,9 +8,9 @@ class TestRankLossOp(OpTest): self.op_type = "rank_loss" batch_size = 5 # labels_{i} = {0, 1.0} or {0, 0.5, 1.0} - label = np.random.randint(0, 2, size=(batch_size, )).astype("float32") - left = np.random.random((batch_size, )).astype("float32") - right = np.random.random((batch_size, )).astype("float32") + label = np.random.randint(0, 2, size=(batch_size, 1)).astype("float32") + left = np.random.random((batch_size, 1)).astype("float32") + right = np.random.random((batch_size, 1)).astype("float32") loss = np.log(1.0 + np.exp(left - right)) - label * (left - right) self.inputs = {'Label': label, 'Left': left, 'Right': right} self.outputs = {'Out': loss} From 1f6b90904aaab3dc144f966e63c5041888457ee9 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 21 Sep 2017 11:44:55 +0800 Subject: [PATCH 36/50] fix a typo in rank_loss_op --- paddle/operators/rank_loss_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index d98fd54f22..4bba420072 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -39,7 +39,7 @@ class RankLossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), "All inputs must have the same size"); PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), - "All inputs must be row vector with size batch_sizex1."); + "All inputs must be row vector with size batch_size x 1."); ctx.Output("Out")->Resize(label_dims); } }; From 4b948abbf00d097e3eb2a2121174860f2f35c989 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 20 Sep 2017 21:24:54 -0700 Subject: [PATCH 37/50] Update Attribute to make it compatible with BLOCK --- paddle/framework/attribute.cc | 21 ++++++++++++++------- paddle/framework/attribute.h | 4 +++- paddle/framework/framework.proto | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index 27132eaa0b..534c0d8d68 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -47,40 +47,44 @@ template <> AttrType AttrTypeID>>() { return INT_PAIRS; } +template <> +AttrType AttrTypeID() { + return BLOCK; +} Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { switch (attr_desc.type()) { - case paddle::framework::AttrType::INT: { + case framework::AttrType::INT: { return attr_desc.i(); } - case paddle::framework::AttrType::FLOAT: { + case framework::AttrType::FLOAT: { return attr_desc.f(); } - case paddle::framework::AttrType::STRING: { + case framework::AttrType::STRING: { return attr_desc.s(); } - case paddle::framework::AttrType::INTS: { + case framework::AttrType::INTS: { std::vector val(attr_desc.ints_size()); for (int i = 0; i < attr_desc.ints_size(); ++i) { val[i] = attr_desc.ints(i); } return val; } - case paddle::framework::AttrType::FLOATS: { + case framework::AttrType::FLOATS: { std::vector val(attr_desc.floats_size()); for (int i = 0; i < attr_desc.floats_size(); ++i) { val[i] = attr_desc.floats(i); } return val; } - case paddle::framework::AttrType::STRINGS: { + case framework::AttrType::STRINGS: { std::vector val(attr_desc.strings_size()); for 
(int i = 0; i < attr_desc.strings_size(); ++i) { val[i] = attr_desc.strings(i); } return val; } - case paddle::framework::AttrType::INT_PAIRS: { + case framework::AttrType::INT_PAIRS: { std::vector> val(attr_desc.int_pairs_size()); for (int i = 0; i < attr_desc.int_pairs_size(); ++i) { val[i].first = attr_desc.int_pairs(i).first(); @@ -88,6 +92,9 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { } return val; } + case framework::AttrType::BLOCK: { + return g_program_desc.blocks(attr_desc.block_idx()); + } } PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); return boost::blank(); diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index 2b788a76ca..f18123bac7 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -29,11 +29,13 @@ namespace framework { typedef boost::variant, std::vector, std::vector, - std::vector>> + std::vector>, BlockDesc> Attribute; typedef std::unordered_map AttributeMap; +static ProgramDesc g_program_desc; + template AttrType AttrTypeID(); diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index dcbc9ec407..89a49f6906 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -45,7 +45,7 @@ message OpDesc { repeated float floats = 7; repeated string strings = 8; repeated IntPair int_pairs = 9; - optional int32 block = 10; + optional int32 block_idx = 10; }; message Var { From 659f2f71ac62434485675ce6cc1403fe4409c589 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 21 Sep 2017 13:29:42 +0800 Subject: [PATCH 38/50] Bug fix for get device_context. --- paddle/operators/gemm_conv2d_op.h | 21 +++++++++--------- paddle/operators/math/im2col.cc | 8 +++---- paddle/operators/math/im2col.cu | 32 ++++++++++++++-------------- paddle/operators/math/im2col.h | 4 ++-- paddle/operators/math/im2col_test.cc | 4 ++-- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/paddle/operators/gemm_conv2d_op.h b/paddle/operators/gemm_conv2d_op.h index 08b7df1dfe..72de0a5cf3 100644 --- a/paddle/operators/gemm_conv2d_op.h +++ b/paddle/operators/gemm_conv2d_op.h @@ -75,8 +75,7 @@ class GemmConv2DKernel : public framework::OpKernel { framework::DDim output_matrix_shape = {output_channels, output_height * output_width}; - auto* device_context = - const_cast(context.device_context_); + auto device_context = context.device_context(); // convolution operator: im2col + gemm int in_step = input_channels / groups; @@ -93,8 +92,8 @@ class GemmConv2DKernel : public framework::OpKernel { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, T(1.0), - &out_slice, T(0.0), device_context); + math::matmul(device_context, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); } } } @@ -160,8 +159,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel { filter.numel() / filter.dims()[0]}; filter.Resize(filter_matrix_shape); - auto* device_context = - const_cast(context.device_context_); + auto device_context = context.device_context(); // convolution backward input operator: gemm + col2im // convolution backward weight operator: im2col + gemm @@ -184,8 +182,9 @@ class GemmConvGrad2DKernel : public framework::OpKernel { out_grad_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, true, out_grad_slice, false, - T(1.0), 
&col_matrix, T(0.0), device_context); + math::matmul(device_context, filter_slice, true, + out_grad_slice, false, T(1.0), &col_matrix, + T(0.0)); // col2im Tensor in_grad_slice = @@ -218,9 +217,9 @@ class GemmConvGrad2DKernel : public framework::OpKernel { // gemm Tensor filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - math::matmul(out_grad_slice, false, col_matrix, true, - T(1.0), &filter_grad_slice, T(1.0), - device_context); + math::matmul(device_context, out_grad_slice, false, + col_matrix, true, T(1.0), &filter_grad_slice, + T(1.0)); } } } diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index 5727c1cab1..36a07f7a31 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -29,7 +29,7 @@ class Im2ColFunctor<<< - grid, threads, 0, - reinterpret_cast(context)->stream()>>>( + im2col<<(context) + .stream()>>>( im.data(), num_outputs, input_height, input_width, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, output_height, output_width, col.data()); @@ -151,7 +151,7 @@ class Col2ImFunctor<<< - grid, threads, 0, - reinterpret_cast(context)->stream()>>>( + col2im<<(context) + .stream()>>>( num_kernels, col.data(), input_height + 2 * padding_height, input_width + 2 * padding_width, input_channels, filter_height, filter_width, stride_height, stride_width, padding_height, @@ -237,7 +237,7 @@ class Im2ColFunctor<<< - grid, threads, 0, - reinterpret_cast(context)->stream()>>>( + im2colOCF<<(context) + .stream()>>>( im.data(), col.data(), input_channels, input_height, input_width, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, output_height, output_width); @@ -320,7 +320,7 @@ class Col2ImFunctor<<< - grid, threads, 0, - reinterpret_cast(context)->stream()>>>( + col2imOCF<<(context) + .stream()>>>( im.data(), col.data(), input_channels, input_height, input_width, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, output_height, output_width); diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index 8958c5457c..9a119c6894 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -74,7 +74,7 @@ class Im2ColFunctor { public: void operator()(const framework::Tensor& im, framework::Tensor& col, int stride_height, int stride_width, int padding_height, - int padding_width, platform::DeviceContext* context); + int padding_width, const platform::DeviceContext& context); }; template @@ -82,7 +82,7 @@ class Col2ImFunctor { public: void operator()(framework::Tensor& im, const framework::Tensor& col, int stride_height, int stride_width, int padding_height, - int padding_width, platform::DeviceContext* context); + int padding_width, const platform::DeviceContext& context); }; } // namespace math diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 4f380388b1..e0943c0379 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -78,8 +78,8 @@ void testIm2col() { PADDLE_THROW("no GPU support"); #endif // PADDLE_ONLY_CPU } - im2col(input, output_cfo, stride, stride, padding, padding, context); - im2col_ocf(input, output_ocf, stride, stride, padding, padding, context); + im2col(input, output_cfo, stride, stride, padding, padding, *context); + im2col_ocf(input, output_ocf, stride, stride, padding, padding, *context); float* out_cfo_ptr; if (paddle::platform::is_cpu_place(*place)) { 
From bb546cf13e6076b28d748b526a4486021b0d2b84 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 21 Sep 2017 13:35:46 +0800 Subject: [PATCH 39/50] Bug fix. --- paddle/operators/gemm_conv2d_op.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/paddle/operators/gemm_conv2d_op.h b/paddle/operators/gemm_conv2d_op.h index 72de0a5cf3..f9215b46fc 100644 --- a/paddle/operators/gemm_conv2d_op.h +++ b/paddle/operators/gemm_conv2d_op.h @@ -75,8 +75,6 @@ class GemmConv2DKernel : public framework::OpKernel { framework::DDim output_matrix_shape = {output_channels, output_height * output_width}; - auto device_context = context.device_context(); - // convolution operator: im2col + gemm int in_step = input_channels / groups; int out_step = output_channels / groups; @@ -87,13 +85,13 @@ class GemmConv2DKernel : public framework::OpKernel { // im2col Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(in_slice, col, strides[0], strides[1], paddings[0], paddings[1], - device_context); + context.device_context()); // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(device_context, filter_slice, false, col_matrix, - false, T(1.0), &out_slice, T(0.0)); + math::matmul(context.device_context(), filter_slice, false, + col_matrix, false, T(1.0), &out_slice, T(0.0)); } } } @@ -159,8 +157,6 @@ class GemmConvGrad2DKernel : public framework::OpKernel { filter.numel() / filter.dims()[0]}; filter.Resize(filter_matrix_shape); - auto device_context = context.device_context(); - // convolution backward input operator: gemm + col2im // convolution backward weight operator: im2col + gemm int in_step = input_channels / groups; @@ -182,7 +178,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel { out_grad_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(device_context, filter_slice, true, + math::matmul(context.device_context(), filter_slice, true, out_grad_slice, false, T(1.0), &col_matrix, T(0.0)); @@ -190,7 +186,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel { Tensor in_grad_slice = in_grad_batch.Slice(g * in_step, (g + 1) * in_step); col2im(in_grad_slice, col, strides[0], strides[1], paddings[0], - paddings[1], device_context); + paddings[1], context.device_context()); } } } @@ -212,14 +208,14 @@ class GemmConvGrad2DKernel : public framework::OpKernel { out_grad_batch.Slice(g * out_step, (g + 1) * out_step); Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(in_slice, col, strides[0], strides[1], paddings[0], - paddings[1], device_context); + paddings[1], context.device_context()); // gemm Tensor filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - math::matmul(device_context, out_grad_slice, false, - col_matrix, true, T(1.0), &filter_grad_slice, - T(1.0)); + math::matmul(context.device_context(), out_grad_slice, + false, col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); } } } From c42e2049a2cdd9799d1ffb088cb306d9378ac909 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 21 Sep 2017 13:46:43 +0800 Subject: [PATCH 40/50] Refine code. 
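(For reference, patches 38-40 keep the same computation and only change how the DeviceContext is passed: the conv2d kernels express convolution as im2col followed by a GEMM. The NumPy sketch below shows that decomposition for a single channels-first image; the helper names and the CFO layout are assumptions for illustration, not the Paddle functors.)

import numpy as np

def im2col(im, fh, fw, stride=1, pad=0):
    # im: (C, H, W) -> column matrix of shape (C * fh * fw, out_h * out_w).
    c, h, w = im.shape
    out_h = (h + 2 * pad - fh) // stride + 1
    out_w = (w + 2 * pad - fw) // stride + 1
    padded = np.pad(im, ((0, 0), (pad, pad), (pad, pad)), mode="constant")
    col = np.empty((c, fh, fw, out_h, out_w), dtype=im.dtype)
    for i in range(fh):
        for j in range(fw):
            col[:, i, j] = padded[:, i:i + stride * out_h:stride,
                                  j:j + stride * out_w:stride]
    return col.reshape(c * fh * fw, out_h * out_w)

def conv2d_gemm(im, filt, stride=1, pad=0):
    # filt: (out_channels, C, fh, fw); one image, groups == 1.
    oc, c, fh, fw = filt.shape
    col = im2col(im, fh, fw, stride, pad)
    out = filt.reshape(oc, -1) @ col  # the GEMM step
    out_h = (im.shape[1] + 2 * pad - fh) // stride + 1
    out_w = (im.shape[2] + 2 * pad - fw) // stride + 1
    return out.reshape(oc, out_h, out_w)
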
--- paddle/operators/gemm_conv2d_op.h | 12 ++++++------ paddle/operators/math/im2col.cc | 22 ++++++++++++---------- paddle/operators/math/im2col.cu | 22 ++++++++++++---------- paddle/operators/math/im2col.h | 11 ++++++----- paddle/operators/math/im2col_test.cc | 4 ++-- 5 files changed, 38 insertions(+), 33 deletions(-) diff --git a/paddle/operators/gemm_conv2d_op.h b/paddle/operators/gemm_conv2d_op.h index f9215b46fc..5c9e81732a 100644 --- a/paddle/operators/gemm_conv2d_op.h +++ b/paddle/operators/gemm_conv2d_op.h @@ -84,8 +84,8 @@ class GemmConv2DKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { // im2col Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - im2col(in_slice, col, strides[0], strides[1], paddings[0], paddings[1], - context.device_context()); + im2col(context.device_context(), in_slice, col, strides[0], strides[1], + paddings[0], paddings[1]); // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); @@ -185,8 +185,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel { // col2im Tensor in_grad_slice = in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - col2im(in_grad_slice, col, strides[0], strides[1], paddings[0], - paddings[1], context.device_context()); + col2im(context.device_context(), in_grad_slice, col, strides[0], + strides[1], paddings[0], paddings[1]); } } } @@ -207,8 +207,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel { Tensor out_grad_slice = out_grad_batch.Slice(g * out_step, (g + 1) * out_step); Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - im2col(in_slice, col, strides[0], strides[1], paddings[0], - paddings[1], context.device_context()); + im2col(context.device_context(), in_slice, col, strides[0], + strides[1], paddings[0], paddings[1]); // gemm Tensor filter_grad_slice = diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index 36a07f7a31..c08a3380f0 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -27,9 +27,10 @@ template class Im2ColFunctor { public: - void operator()(const framework::Tensor& im, framework::Tensor& col, + void operator()(const platform::DeviceContext& context, + const framework::Tensor& im, framework::Tensor& col, int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); @@ -79,9 +80,9 @@ template class Col2ImFunctor { public: - void operator()(framework::Tensor& im, const framework::Tensor& col, - int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + void operator()(const platform::DeviceContext& context, framework::Tensor& im, + const framework::Tensor& col, int stride_height, + int stride_width, int padding_height, int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); int input_channels = im.dims()[0]; @@ -137,9 +138,10 @@ template class Im2ColFunctor { public: - void operator()(const framework::Tensor& im, framework::Tensor& col, + void operator()(const platform::DeviceContext& context, + const framework::Tensor& im, framework::Tensor& col, int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); int input_channels = im.dims()[0]; @@ 
-197,9 +199,9 @@ template class Col2ImFunctor { public: - void operator()(framework::Tensor& im, const framework::Tensor& col, - int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + void operator()(const platform::DeviceContext& context, framework::Tensor& im, + const framework::Tensor& col, int stride_height, + int stride_width, int padding_height, int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); int input_channels = im.dims()[0]; diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index b433c8f8e8..01f60bfe70 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -64,9 +64,10 @@ template class Im2ColFunctor { public: - void operator()(const framework::Tensor& im, framework::Tensor& col, + void operator()(const platform::DeviceContext& context, + const framework::Tensor& im, framework::Tensor& col, int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); @@ -149,9 +150,9 @@ template class Col2ImFunctor { public: - void operator()(framework::Tensor& im, const framework::Tensor& col, - int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + void operator()(const platform::DeviceContext& context, framework::Tensor& im, + const framework::Tensor& col, int stride_height, + int stride_width, int padding_height, int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); @@ -235,9 +236,10 @@ template class Im2ColFunctor { public: - void operator()(const framework::Tensor& im, framework::Tensor& col, + void operator()(const platform::DeviceContext& context, + const framework::Tensor& im, framework::Tensor& col, int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); int input_channels = im.dims()[0]; @@ -318,9 +320,9 @@ template class Col2ImFunctor { public: - void operator()(framework::Tensor& im, const framework::Tensor& col, - int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context) { + void operator()(const platform::DeviceContext& context, framework::Tensor& im, + const framework::Tensor& col, int stride_height, + int stride_width, int padding_height, int padding_width) { PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(col.dims().size() == 5); int input_channels = im.dims()[0]; diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index 9a119c6894..7b717e1603 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -72,17 +72,18 @@ enum class ColFormat { kCFO = 0, kOCF = 1 }; template class Im2ColFunctor { public: - void operator()(const framework::Tensor& im, framework::Tensor& col, + void operator()(const platform::DeviceContext& context, + const framework::Tensor& im, framework::Tensor& col, int stride_height, int stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context); + int padding_width); }; template class Col2ImFunctor { public: - void operator()(framework::Tensor& im, const framework::Tensor& col, - int stride_height, int 
stride_width, int padding_height, - int padding_width, const platform::DeviceContext& context); + void operator()(const platform::DeviceContext& context, framework::Tensor& im, + const framework::Tensor& col, int stride_height, + int stride_width, int padding_height, int padding_width); }; } // namespace math diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index e0943c0379..f0b8c88591 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -78,8 +78,8 @@ void testIm2col() { PADDLE_THROW("no GPU support"); #endif // PADDLE_ONLY_CPU } - im2col(input, output_cfo, stride, stride, padding, padding, *context); - im2col_ocf(input, output_ocf, stride, stride, padding, padding, *context); + im2col(*context, input, output_cfo, stride, stride, padding, padding); + im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding); float* out_cfo_ptr; if (paddle::platform::is_cpu_place(*place)) { From a524498efe8f1273bafe7e1a874ef272ce00bb1d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 21 Sep 2017 12:55:46 +0800 Subject: [PATCH 41/50] fix the bug in TeamCity environment. --- paddle/framework/operator.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 4a078258d2..82a23797d4 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -342,10 +342,9 @@ class InferShapeContext { PADDLE_ENFORCE_LT(j, OutputSize(out)); auto* in_var = MultiInputVar(in)[i]; auto* out_var = MultiOutputVar(out)[j]; - PADDLE_ENFORCE(in_var->IsType(), - "The %d-th input of Input(%s) must be LoDTensor.", in); + if (!in_var->IsType()) return; PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", out); + "The %d-th output of Output(%s) must be LoDTensor.", j, out); auto in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -363,6 +362,13 @@ template <> const std::vector InferShapeContext::MultiInput( const std::string& name) const; +template <> +Tensor* InferShapeContext::Output(const std::string& name) const; + +template <> +std::vector InferShapeContext::MultiOutput( + const std::string& name) const; + template struct EigenDeviceConverter; From ccbb285311843afcccdc7fbf85053a5c617db83d Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 21 Sep 2017 14:04:16 +0800 Subject: [PATCH 42/50] Increase the max_relative_error in TestConv2dOp. 
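(Patch 42 below only loosens the tolerance used by OpTest's gradient checker. That check compares the analytic gradient against a central finite difference and requires the maximum relative error to stay under max_relative_error; a rough sketch with assumed helper names, not the actual OpTest code, is shown here.)

import numpy as np

def numeric_grad(f, x, delta=5e-3):
    # Central finite differences of a scalar-valued f with respect to x.
    grad = np.zeros_like(x)
    for i in range(x.size):
        orig = x.flat[i]
        x.flat[i] = orig + delta
        fp = f(x)
        x.flat[i] = orig - delta
        fm = f(x)
        x.flat[i] = orig
        grad.flat[i] = (fp - fm) / (2.0 * delta)
    return grad

def max_relative_error(analytic, numeric, eps=1e-8):
    return np.max(np.abs(analytic - numeric) /
                  np.maximum(np.abs(analytic) + np.abs(numeric), eps))
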
--- .../paddle/v2/framework/tests/test_conv2d_op.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_conv2d_op.py b/python/paddle/v2/framework/tests/test_conv2d_op.py index 3142a60a1a..118a5fc1cd 100644 --- a/python/paddle/v2/framework/tests/test_conv2d_op.py +++ b/python/paddle/v2/framework/tests/test_conv2d_op.py @@ -73,13 +73,22 @@ class TestConv2dOp(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(set(['Input', 'Filter']), 'Output') + self.check_grad( + set(['Input', 'Filter']), 'Output', max_relative_error=0.05) def test_check_grad_no_filter(self): - self.check_grad(['Input'], 'Output', no_grad_set=set(['Filter'])) + self.check_grad( + ['Input'], + 'Output', + max_relative_error=0.05, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): - self.check_grad(['Filter'], 'Output', no_grad_set=set(['Input'])) + self.check_grad( + ['Filter'], + 'Output', + max_relative_error=0.05, + no_grad_set=set(['Input'])) def init_groups(self): self.groups = 1 From 35dd981db052c27129080d62e7b41cbd7986ac62 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 21 Sep 2017 16:25:55 +0800 Subject: [PATCH 43/50] Fix linking error on mac. --- cmake/util.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/util.cmake b/cmake/util.cmake index e814cad36f..ac911052eb 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -25,7 +25,7 @@ function(target_circle_link_libraries TARGET_NAME) endif() endforeach() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if(IOS AND NOT IOS_ENABLE_BITCODE) + if(NOT IOS_ENABLE_BITCODE) list(APPEND LIBS "-undefined dynamic_lookup") endif() endif() From a77af98bfadcb07fecc24bf72c127efe7133691d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 21 Sep 2017 11:04:06 -0700 Subject: [PATCH 44/50] Fix bug of static variable --- paddle/framework/attribute.cc | 11 ++++++++++- paddle/framework/attribute.h | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index 534c0d8d68..159ed03b92 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -19,6 +19,15 @@ limitations under the License. */ namespace paddle { namespace framework { +static ProgramDesc* g_program_desc = nullptr; + +ProgramDesc& GetProgramDesc() { + if (g_program_desc == nullptr) { + g_program_desc = new ProgramDesc(); + } + return *g_program_desc; +} + template <> AttrType AttrTypeID() { return INT; @@ -93,7 +102,7 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { return val; } case framework::AttrType::BLOCK: { - return g_program_desc.blocks(attr_desc.block_idx()); + return GetProgramDesc().mutable_blocks(attr_desc.block_idx()); } } PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index f18123bac7..6735ca0a8d 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -29,12 +29,12 @@ namespace framework { typedef boost::variant, std::vector, std::vector, - std::vector>, BlockDesc> + std::vector>, BlockDesc*> Attribute; typedef std::unordered_map AttributeMap; -static ProgramDesc g_program_desc; +ProgramDesc& GetProgramDesc(); template AttrType AttrTypeID(); From 4f9d82a9c62064c9523bc84a457e05b03a1ddd0d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 22 Sep 2017 10:10:59 +0800 Subject: [PATCH 45/50] Fix bug. 
--- paddle/framework/backward_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index b4e51ad6ed..6932f5b989 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -127,8 +127,8 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker { public: FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Src", "x"); - AddOutput("Dst", "out"); + AddInput("X", "x"); + AddOutput("Y", "out"); AddComment(""); } }; From fbc0db4a61c76ffa29f8e0df405557b7997b15f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Fri, 22 Sep 2017 11:46:57 +0800 Subject: [PATCH 46/50] Update faq (#4317) * update faq * follow comments --- doc/faq/index_cn.rst | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index 00192aa69b..acbf4c87ae 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -158,17 +158,23 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 -7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform. +7. paddlepaddle\*.whl is not a supported wheel on this platform. ------------------------------------------------------------------------ -出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的, -而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。 +出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统,并安装了python 2.7和pip 9.0.1。 + 更新 :code:`pip` 包的方法是\: .. code-block:: bash pip install --upgrade pip +如果还不行,可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀, +并对比是否和正在安装的后缀一致。 + +如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ,需要升级pip版本到最新; +如果系统支持 :code:`manylinux1_x86_64` 而安装包(本地)是 :code:`linux_x86_64` ,可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。 + 8. python相关的单元测试都过不了 -------------------------------- @@ -310,7 +316,7 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异 * 模型一直不收敛,发散到了一个数值特别大的地方。 * 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 -主要的解决办法是减小学习律或者对数据进行归一化处理。 +主要的解决办法是减小学习率或者对数据进行归一化处理。 15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 ------------------------------------------------------------------------ @@ -373,3 +379,15 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数 parameters = paddle.parameters.create(my_cost) parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) + +18. 集群多节点训练,日志中保存均为网络通信类错误 +------------------------------ + +集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。 +此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查: + +* 从 :code:`train.log` , :code:`server.log` 找到最早报错的地方,查看是否是其他错误引发的报错(比如FPE,内存不足,磁盘空间不足等)。 + +* 如果发现最早的报错就是网络通信的问题,很有可能是非独占方式执行导致的端口冲突,可以联系OP,看当前MPI集群是否支持resource=full参数提交,如果支持增加此参数提交,并更换job 端口。 + +* 如果当前MPI集群并不支持任务独占模式,可以联系OP是否可以更换集群或升级当前集群。 \ No newline at end of file From 0dce16a697713b848d2afa5d8a5ee3b8108b150a Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 22 Sep 2017 10:20:02 +0800 Subject: [PATCH 47/50] Use bool type for attr in cross_entropy_op. 
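(The operator touched below documents two loss formulations, one-hot and soft-label cross-entropy. Both can be written compactly in NumPy; this mirrors the reference computations in test_cross_entropy_op.py and is a sketch only, not the CPU/CUDA kernels.)

import numpy as np

def cross_entropy(x, label, soft_label=False):
    # x: (batch, classes), each row a probability distribution.
    # label: (batch, classes) soft targets if soft_label, else (batch, 1) class ids.
    if soft_label:
        # Y[i] = -sum_j Label[i, j] * log(X[i, j])
        return (-label * np.log(x)).sum(axis=1, keepdims=True)
    idx = label.reshape(-1).astype(int)
    # Y[i] = -log(X[i, Label[i]])
    return -np.log(x[np.arange(x.shape[0]), idx]).reshape(-1, 1)
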
--- paddle/operators/cross_entropy_op.cc | 25 ++++++++----------- paddle/operators/cross_entropy_op.cu | 4 +-- paddle/operators/cross_entropy_op.h | 4 +-- .../framework/tests/test_cross_entropy_op.py | 6 ++--- 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 953367eb8b..559fc5a8d7 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -35,19 +35,16 @@ class CrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2."); PADDLE_ENFORCE_EQ(label->dims().size(), 2, "Input(Label)'s rank must be 2."); - // TODO(xinghai-sun): remove this check after swtiching to bool - PADDLE_ENFORCE(ctx.Attr("soft_label") == 0 || - ctx.Attr("soft_label") == 1); PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], "The 1st dimension of Input(X) and Input(Label) must " "be equal."); - if (ctx.Attr("soft_label") == 1) { + if (ctx.Attr("soft_label")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == 1, The 2nd dimension of " + "If Attr(soft_label) == true, The 2nd dimension of " "Input(X) and Input(Label) must be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == 0, The 2nd dimension of " + "If Attr(soft_label) == false, The 2nd dimension of " "Input(Label) must be 1."); } @@ -74,9 +71,6 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2."); PADDLE_ENFORCE_EQ(label->dims().size(), 2, "Input(Label)'s rank must be 2."); - // TODO(xinghai-sun): remove this check after swtiching to bool - PADDLE_ENFORCE(ctx.Attr("soft_label") == 0 || - ctx.Attr("soft_label") == 1); PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], "The 1st dimension of Input(X) and Input(Label) must " "be equal."); @@ -85,13 +79,13 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "be equal."); PADDLE_ENFORCE_EQ(dy->dims()[1], 1, "The 2nd dimension of Input(Y@Grad) must be 1."); - if (ctx.Attr("soft_label") == 1) { + if (ctx.Attr("soft_label")) { PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], - "If Attr(soft_label) == 1, The 2nd dimension of " + "If Attr(soft_label) == true, The 2nd dimension of " "Input(X) and Input(Label) must be equal."); } else { PADDLE_ENFORCE_EQ(label->dims()[1], 1, - "If Attr(soft_label) == 0, The 2nd dimension of " + "If Attr(soft_label) == false, The 2nd dimension of " "Input(Label) must be 1."); } @@ -108,7 +102,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The first input of CrossEntropyOp"); AddInput("Label", "The second input of CrossEntropyOp"); AddOutput("Y", "The output of CrossEntropyOp"); - AddAttr("soft_label", "Is soft label. Default zero.").SetDefault(0); + AddAttr("soft_label", "Is soft label. Default zero.") + .SetDefault(false); AddComment(R"DOC( CrossEntropy Operator. @@ -116,12 +111,12 @@ CrossEntropy Operator. It supports both standard cross-entropy and soft-label cross-entropy loss computation. 
1) One-hot cross-entropy: - soft_label = 0, Label[i, 0] indicates the class index for sample i: + soft_label = False, Label[i, 0] indicates the class index for sample i: Y[i] = -log(X[i, Label[i]]) 2) Soft-label cross-entropy: - soft_label = 1, Label[i, j] indicates the soft label of class j + soft_label = True, Label[i, j] indicates the soft label of class j for sample i: Y[i] = \sum_j{-Label[i, j] * log(X[i, j])} diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index ab6ad0e062..1d6361a814 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -102,7 +102,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { int grid = (n + block - 1) / block; // TODO(qingqing) launch kernel on specified stream // base on ExecutionContext. - if (ctx.Attr("soft_label") == 1) { + if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); SoftCrossEntropyKernel<<>>(y_data, x_data, label_data, n, d); @@ -137,7 +137,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { grid = (n + block - 1) / block; // TODO(qingqing): launch kernel on specified stream // base on ExecutionContext. - if (ctx.Attr("soft_label") == 1) { + if (ctx.Attr("soft_label")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, n, d); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 1b4b23ac20..69caba5ff3 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -51,7 +51,7 @@ class CrossEntropyOpKernel : public framework::OpKernel { int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - if (ctx.Attr("soft_label") == 1) { + if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); int index = 0; for (int i = 0; i < batch_size; ++i) { @@ -92,7 +92,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { int class_num = x->dims()[1]; // TODO(qingqing): make zero setting an common function. 
- if (ctx.Attr("soft_label") == 1) { + if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); int index = 0; for (int i = 0; i < batch_size; ++i) { diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 0206ca064b..f10db78322 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -19,7 +19,7 @@ class TestCrossEntropyOp1(OpTest): dtype="float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {'soft_label': 0} + self.attrs = {'soft_label': False} def test_check_output(self): self.check_output() @@ -45,7 +45,7 @@ class TestCrossEntropyOp2(OpTest): axis=1, keepdims=True).astype("float32") self.inputs = {'X': X, 'Label': label} self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': 1} + self.attrs = {'soft_label': True} def test_check_output(self): self.check_output() @@ -76,7 +76,7 @@ class TestCrossEntropyOp3(OpTest): axis=1, keepdims=True).astype("float32") self.inputs = {'X': X, 'Label': label} self.outputs = {'Y': cross_entropy} - self.attrs = {'soft_label': 1} + self.attrs = {'soft_label': True} def test_check_output(self): self.check_output() From efb56db770227900e0d7de22fa3711484e9a2747 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 22 Sep 2017 13:56:54 +0800 Subject: [PATCH 48/50] tune max_relative_error in test_cos_sim_op. --- python/paddle/v2/framework/tests/test_cos_sim_op.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py index d314ce391e..47557ccb41 100644 --- a/python/paddle/v2/framework/tests/test_cos_sim_op.py +++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py @@ -24,15 +24,15 @@ class TestCosSimOp(OpTest): self.check_output() def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05) + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06) def test_check_grad_ingore_x(self): self.check_grad( - ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X")) + ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X")) def test_check_grad_ingore_y(self): self.check_grad( - ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) + ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y')) class TestCosSimOp2(TestCosSimOp): From dd2f477a5053690dd8bc542e7d25b3ad33638e50 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 22 Sep 2017 15:50:58 +0800 Subject: [PATCH 49/50] Fix bug in cc_library, when merging several libraries into one on Linux. --- cmake/generic.cmake | 62 ++++++++++++++++++++------------------ paddle/capi/CMakeLists.txt | 4 +-- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d2aab938d4..0bbf922931 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -106,22 +106,22 @@ function(merge_static_libs TARGET_NAME) endforeach() list(REMOVE_DUPLICATES libs_deps) - if(APPLE) # Use OSX's libtool to merge archives - # To produce a library we need at least one source file. - # It is created by add_custom_command below and will helps - # also help to track dependencies. - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + # To produce a library we need at least one source file. 
+ # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + if(APPLE) # Use OSX's libtool to merge archives # Make the generated dummy source file depended on all static input # libs. If input lib changes,the source file is touched # which causes the desired effect (relink). - add_custom_command(OUTPUT ${dummyfile} - COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile} + add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} DEPENDS ${libs}) # Generate dummy staic lib - file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") - add_library(${TARGET_NAME} STATIC ${dummyfile}) + file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) target_link_libraries(${TARGET_NAME} ${libs_deps}) foreach(lib ${libs}) @@ -130,11 +130,14 @@ function(merge_static_libs TARGET_NAME) endforeach() add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" - COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) + COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles} + ) else() # general UNIX: use "ar" to extract objects and re-add to a common lib + set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir) + foreach(lib ${libs}) - set(objlistfile ${lib}.objlist) # list of objects in the input library - set(objdir ${lib}.objdir) + set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library + set(objdir ${target_DIR}/${lib}.objdir) add_custom_command(OUTPUT ${objdir} COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} @@ -142,31 +145,32 @@ function(merge_static_libs TARGET_NAME) add_custom_command(OUTPUT ${objlistfile} COMMAND ${CMAKE_AR} -x "$" - COMMAND ${CMAKE_AR} -t "$" > ../${objlistfile} + COMMAND ${CMAKE_AR} -t "$" > ${objlistfile} DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) - - list(APPEND mergebases "${mergebase}") + list(APPEND target_OBJS "${objlistfile}") endforeach() - add_library(${TARGET_NAME} STATIC ${mergebases}) + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). 
+ add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs} ${target_OBJS}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) target_link_libraries(${TARGET_NAME} ${libs_deps}) # Get the file name of the generated library - set(outlibfile "$") + set(target_LIBNAME "$") - foreach(lib ${libs}) - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} cr ${outlibfile} *.o - COMMAND ${CMAKE_RANLIB} ${outlibfile} - WORKING_DIRECTORY ${lib}.objdir) - endforeach() + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'` + COMMAND ${CMAKE_RANLIB} ${target_LIBNAME} + WORKING_DIRECTORY ${target_DIR}) endif() endfunction(merge_static_libs) @@ -196,7 +200,7 @@ function(cc_library TARGET_NAME) add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) else(cc_library_SRCS) - if (cc_library_DEPS) + if(cc_library_DEPS) merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) else() message(FATAL "Please specify source file or library in cc_library.") diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index dd9e4f1cbd..b9bbe58951 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -62,6 +62,7 @@ if(ANDROID) LIBRARY DESTINATION lib/${ANDROID_ABI}) execute_process( COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMITS_LIST RESULT_VARIABLE GIT_COMMITS_LIST_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -81,8 +82,7 @@ if(ANDROID) )" ) else(ANDROID) - install(TARGETS paddle_capi_whole - ARCHIVE DESTINATION lib) + install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib) if(NOT IOS) install(TARGETS paddle_capi_shared DESTINATION lib) endif() From f99841dd2aa63c87b701a0cce6d482033e4b66aa Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 22 Sep 2017 19:31:21 +0800 Subject: [PATCH 50/50] Elementwise operator. 
(#4139) Elementwise operator add/sub/mul/div --- paddle/operators/elementwise_add_op.cc | 39 +++ paddle/operators/elementwise_add_op.cu | 25 ++ paddle/operators/elementwise_add_op.h | 113 +++++++ paddle/operators/elementwise_div_op.cc | 40 +++ paddle/operators/elementwise_div_op.cu | 25 ++ paddle/operators/elementwise_div_op.h | 115 +++++++ paddle/operators/elementwise_mul_op.cc | 99 +----- paddle/operators/elementwise_mul_op.cu | 4 +- paddle/operators/elementwise_mul_op.h | 201 ++++------- paddle/operators/elementwise_op.h | 312 ++++++++++++++++++ paddle/operators/elementwise_sub_op.cc | 39 +++ paddle/operators/elementwise_sub_op.cu | 25 ++ paddle/operators/elementwise_sub_op.h | 115 +++++++ .../tests/test_elementwise_add_op.py | 96 ++++++ .../tests/test_elementwise_div_op.py | 105 ++++++ .../tests/test_elementwise_mul_op.py | 73 +--- .../tests/test_elementwise_sub_op.py | 96 ++++++ 17 files changed, 1230 insertions(+), 292 deletions(-) create mode 100644 paddle/operators/elementwise_add_op.cc create mode 100644 paddle/operators/elementwise_add_op.cu create mode 100644 paddle/operators/elementwise_add_op.h create mode 100644 paddle/operators/elementwise_div_op.cc create mode 100644 paddle/operators/elementwise_div_op.cu create mode 100644 paddle/operators/elementwise_div_op.h create mode 100644 paddle/operators/elementwise_op.h create mode 100644 paddle/operators/elementwise_sub_op.cc create mode 100644 paddle/operators/elementwise_sub_op.cu create mode 100644 paddle/operators/elementwise_sub_op.h create mode 100644 python/paddle/v2/framework/tests/test_elementwise_add_op.py create mode 100644 python/paddle/v2/framework/tests/test_elementwise_div_op.py create mode 100644 python/paddle/v2/framework/tests/test_elementwise_sub_op.py diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc new file mode 100644 index 0000000000..5f7b654d69 --- /dev/null +++ b/paddle/operators/elementwise_add_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/elementwise_add_op.h" + +namespace paddle { +namespace operators { +class ElementwiseAddOpMaker : public ElementwiseOpMaker { + public: + ElementwiseAddOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("add", "Out = X + Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, + elementwise_add_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu new file mode 100644 index 0000000000..85d063a76b --- /dev/null +++ b/paddle/operators/elementwise_add_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/elementwise_add_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel); +REGISTER_OP_GPU_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h new file mode 100644 index 0000000000..42a7a29e99 --- /dev/null +++ b/paddle/operators/elementwise_add_op.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/elementwise_op.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseAddKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseCompute(ctx); + } +}; + +template +struct ElementwiseAddGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e; + } + } +}; + +template +struct ElementwiseAddOneGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.sum(); + } + } +}; + +template +struct ElementwiseAddBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseAddBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseAddGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseGradCompute, + ElementwiseAddOneGradFunctor, + ElementwiseAddBroadCastGradFunctor, + ElementwiseAddBroadCast2GradFunctor>(ctx); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc new file mode 100644 index 0000000000..c6898150d3 --- /dev/null +++ b/paddle/operators/elementwise_div_op.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/elementwise_div_op.h" + +namespace paddle { +namespace operators { +class ElementwiseDivOpMaker : public ElementwiseOpMaker { + public: + ElementwiseDivOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Div", "Out = X / Y"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, + elementwise_div_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu new file mode 100644 index 0000000000..b96aa31748 --- /dev/null +++ b/paddle/operators/elementwise_div_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/elementwise_div_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel); +REGISTER_OP_GPU_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h new file mode 100644 index 0000000000..6ef60cdf8d --- /dev/null +++ b/paddle/operators/elementwise_div_op.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/elementwise_op.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseDivKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseCompute(ctx); + } +}; + +template +struct ElementwiseDivGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto y_e = framework::EigenVector::Flatten(*y); + auto z_e = framework::EigenVector::Flatten(*z); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = -1.0 * dz_e * z_e / y_e; + } + } +}; + +template +struct ElementwiseDivBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseDivBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseDivGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseGradCompute, + ElementwiseDivGradFunctor, + ElementwiseDivBroadCastGradFunctor, + ElementwiseDivBroadCast2GradFunctor>(ctx); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index 02bd4c7b85..f2544b54d6 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -17,104 +17,25 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - -class ElementWiseMulOp : public framework::OperatorWithKernel { +class ElementwiseMulOpMaker : public ElementwiseOpMaker { public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), - "Input(X) of ElementWiseMulOp should not be null."); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), - "Input(Y) of ElementWiseMulOp should not be null."); - PADDLE_ENFORCE_NOT_NULL( - ctx.OutputVar("Out"), - "Output(Out) of ElementWiseMulOp should not be 
null."); - - auto x_dim = ctx.Input("X")->dims(); - auto y_dim = ctx.Input("Y")->dims(); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input.") - ctx.Output("Out")->Resize(x_dim); - ctx.ShareLoD("X", /*->*/ "Out"); - } -}; - -class ElementWiseMulOpMaker : public framework::OpProtoAndCheckerMaker { - public: - ElementWiseMulOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The first input of elementwise mul op"); - AddInput("Y", "The second input of elementwise mul op"); - AddAttr("axis", - R"DOC( -When shape(Y) does not equal shape(X),Y will be broadcasted -to match the shape of X and axis should be dimension index Y in X - )DOC") - .SetDefault(-1) - .EqualGreaterThan(-1); - - AddOutput("Out", "The output of elementwise mul op"); - AddComment(R"DOC( -Limited elementwise multiple operator.The equation is: Out = X ⊙ Y. -1. The shape of Y should be same with X or -2. Y's shape is a subset of X. - Y will be broadcasted to match the shape of X and axis should be dimension index Y in X. - example: - - shape(X) = (2, 3, 4, 5), shape(Y) = (,) - shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) - shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 - -Both the input X and Y can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD with input X. -)DOC"); + ElementwiseMulOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Mul", "Out = X ⊙ Y"); + AddComment(comment_); } }; -class ElementWiseMulOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null"); - PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - - auto x_dims = ctx.Input("X")->dims(); - auto y_dims = ctx.Input("Y")->dims(); - auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); - auto *x_grad = ctx.Output(framework::GradVarName("X")); - auto *y_grad = ctx.Output(framework::GradVarName("Y")); - - PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") - - if (x_grad) { - x_grad->Resize(x_dims); - } - - if (y_grad) { - y_grad->Resize(y_dims); - } - } -}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(elementwise_mul, ops::ElementWiseMulOp, ops::ElementWiseMulOpMaker, - elementwise_mul_grad, ops::ElementWiseMulOpGrad); +REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, + elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, - ops::ElementWiseMulKernel); + ops::ElementwiseMulKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, - ops::ElementWiseMulGradKernel); + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu index 56f2087c22..da08a75596 100644 --- a/paddle/operators/elementwise_mul_op.cu +++ b/paddle/operators/elementwise_mul_op.cu @@ -19,7 +19,7 @@ namespace ops = paddle::operators; 
REGISTER_OP_GPU_KERNEL( elementwise_mul, - ops::ElementWiseMulKernel); + ops::ElementwiseMulKernel); REGISTER_OP_GPU_KERNEL( elementwise_mul_grad, - ops::ElementWiseMulGradKernel); + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 6d58da580b..1eaf2e3efc 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -13,171 +13,104 @@ limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" -#include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_op.h" namespace paddle { namespace operators { -/* - * Out = X ⊙ Y - * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - * pre=2, n=3*4, post=5 - * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) - * pre=2*3, n=4*5, post=1 - */ - -inline void get_mid_dims(const framework::DDim& x_dims, - const framework::DDim& y_dims, const int axis, - int& pre, int& n, int& post) { - pre = 1; - n = 1; - post = 1; - for (int i = 0; i < axis; ++i) { - pre *= x_dims[i]; - } - - for (int i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], - "Broadcast dimension mismatch."); - n *= y_dims[i]; - } - - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - post *= x_dims[i]; - } -} template -class ElementWiseMulKernel : public framework::OpKernel { +class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); + ElementwiseCompute(ctx); + } +}; +template +struct ElementwiseMulGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { auto x_e = framework::EigenVector::Flatten(*x); auto y_e = framework::EigenVector::Flatten(*y); - auto z_e = framework::EigenVector::Flatten(*z); + auto dz_e = framework::EigenVector::Flatten(*dz); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") - - if (x_dims == y_dims || product(y_dims) == 1) { - z_e.device(ctx.GetEigenDevice()) = x_e * y_e; - return; + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e; } - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), - "Axis should be in range [0, x_dims)"); - - int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, pre, n, post); - if (post == 1) { - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - z_e.device(ctx.GetEigenDevice()) = x_e * y_bcast; - return; - } else { - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - z_e.device(ctx.GetEigenDevice()) = x_e * y_bcast; - return; + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = x_e * dz_e; } } }; -template -class ElementWiseMulGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - +template +struct ElementwiseMulBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { auto x_e = framework::EigenVector::Flatten(*x); auto y_e = framework::EigenVector::Flatten(*y); - auto dout_e = framework::EigenVector::Flatten(*dout); + auto dz_e = framework::EigenVector::Flatten(*dz); - auto x_dims = x->dims(); - auto y_dims = y->dims(); + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); if (dx) { - dx->mutable_data(ctx.GetPlace()); + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; } + if (dy) { - dy->mutable_data(ctx.GetPlace()); + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); } + } +}; - if (x_dims == y_dims || product(y_dims) == 1) { - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(ctx.GetEigenDevice()) = dout_e * y_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(ctx.GetEigenDevice()) = x_e * dout_e; - } - return; +template +struct ElementwiseMulBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; } - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - - int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, pre, n, post); - - // TODO(gongweibao): wrap reshape to a function. 
- if (post == 1) { - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(ctx.GetEigenDevice()) = dout_e * y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(ctx.GetEigenDevice()) = - (x_e * dout_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - return; - } else { - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(ctx.GetEigenDevice()) = dout_e * y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(ctx.GetEigenDevice()) = - (x_e * dout_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } - return; + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); } } }; +template +class ElementwiseMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseGradCompute, + ElementwiseMulGradFunctor, + ElementwiseMulBroadCastGradFunctor, + ElementwiseMulBroadCast2GradFunctor>(ctx); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h new file mode 100644 index 0000000000..f224722c1b --- /dev/null +++ b/paddle/operators/elementwise_op.h @@ -0,0 +1,312 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5) + * 2. 
shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(2, 3, 20) * y.shape(1,1,20).broadcast(2,3,20) + */ +inline void get_mid_dims(const framework::DDim& x_dims, + const framework::DDim& y_dims, const int axis, + int& pre, int& n, int& post) { + pre = 1; + n = 1; + post = 1; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + + for (int i = 0; i < y_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], + "Broadcast dimension mismatch."); + n *= y_dims[i]; + } + + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + post *= x_dims[i]; + } +} + +#define EIGEN_FUNCTOR(name, eigen_op) \ + struct Eigen##name##Functor { \ + template \ + inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_e); \ + } \ + template \ + inline void RunBroadCast(const framework::Tensor* x, \ + const framework::Tensor* y, framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ + .broadcast(Eigen::DSizes(pre, 1)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + } \ + template \ + inline void RunBroadCast2(const framework::Tensor* x, \ + const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n, int post) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ + .broadcast(Eigen::DSizes(pre, 1, post)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + } \ + } + +template +void ElementwiseCompute(const framework::ExecutionContext& ctx) { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input.") + + if (x_dims == y_dims || product(y_dims) == 1) { + functor f; + f.template Run(x, y, z, ctx); + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor f; + f.template RunBroadCast(x, y, z, ctx, pre, n); + return; + } else { + functor f; + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + return; + } +} + +#define EIGEN_ADD(x, y) ((x) + (y)) +EIGEN_FUNCTOR(Add, EIGEN_ADD); + +#define EIGEN_SUB(x, y) ((x) - (y)) +EIGEN_FUNCTOR(Sub, EIGEN_SUB); + +#define EIGEN_MUL(x, y) ((x) * (y)) +EIGEN_FUNCTOR(Mul, EIGEN_MUL); + +#define EIGEN_DIV(x, y) ((x) / (y)) +EIGEN_FUNCTOR(Div, EIGEN_DIV); + +template +void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto place = ctx.GetEigenDevice(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + } + + if (x_dims == y_dims) { + functor f; + f(place, x, y, out, dx, dy, dout); + return; + } + + if (product(y_dims) == 1) { + functor1 f; + f(place, x, y, out, dx, dy, dout); + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + + if (post == 1) { + broadcastfunctor f; + f(place, x, y, out, dx, dy, dout, pre, n); + return; + } else { + broadcast2functor f; + f(place, x, y, out, dx, dy, dout, pre, n, post); + return; + } +} + +class ElementwiseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + using Tensor = framework::Tensor; + void InferShape(const framework::InferShapeContext& ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), + "Input(X) of elementwise op should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), + "Input(Y) of elementwise op should not be null"); + PADDLE_ENFORCE_NOT_NULL( + ctx.OutputVar("Out"), + "Output(Out) of elementwise op should not be null."); + + auto x_dim = ctx.Input("X")->dims(); + auto y_dim = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input.") + ctx.Output("Out")->Resize(x_dim); + ctx.ShareLoD("X", /*->*/ "Out"); + } +}; + +class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ElementwiseOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", R"DOC( +The first input of elementwise op, it's a tensor of any dimensions. +)DOC"); + AddInput("Y", R"DOC( +The second input of elementwise op, it's a tensor and its dimensions +must be smaller than or equal to X's dimensions. +)DOC"); + AddAttr("axis", + R"DOC( +When the shape(Y) does not equal the shape(X), Y will be broadcasted +to match the shape of X and axis should be dimension index Y in X + )DOC") + .SetDefault(-1) + .EqualGreaterThan(-1); + + AddOutput("Out", "The output of elementwise op"); + comment_ = R"DOC( +Limited elementwise {name} operator. The equation is: Out = {equation}. +1. The shape of Y should be the same as the shape of X, or +2. Y's shape is a subset of X. 
+ Y will be broadcasted to match the shape of X and axis should be dimension index Y in X. + + example: + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Both the input X and Y can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input X. +)DOC"; + AddComment(comment_); + } + + protected: + std::string comment_; + + void Replace(std::string& src, std::string from, std::string to) { + std::size_t len_from = std::strlen(from.c_str()); + std::size_t len_to = std::strlen(to.c_str()); + for (std::size_t pos = src.find(from); pos != std::string::npos; + pos = src.find(from, pos + len_to)) { + src.replace(pos, len_from, to); + } + } + + void SetComment(std::string name, std::string equation) { + Replace(comment_, "{name}", name); + Replace(comment_, "{equation}", equation); + } +}; + +class ElementwiseOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using Tensor = framework::Tensor; + + protected: + void InferShape(const framework::InferShapeContext& ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx.Input("X")->dims(); + auto y_dims = ctx.Input("Y")->dims(); + auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + auto* y_grad = ctx.Output(framework::GradVarName("Y")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input.") + + if (x_grad) { + x_grad->Resize(x_dims); + } + + if (y_grad) { + y_grad->Resize(y_dims); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc new file mode 100644 index 0000000000..31c37ff7ab --- /dev/null +++ b/paddle/operators/elementwise_sub_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/elementwise_sub_op.h" + +namespace paddle { +namespace operators { +class ElementwiseSubOpMaker : public ElementwiseOpMaker { + public: + ElementwiseSubOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Sub", "Out = X - Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, + elementwise_sub_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu new file mode 100644 index 0000000000..0efb92fce9 --- /dev/null +++ b/paddle/operators/elementwise_sub_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/elementwise_sub_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel); +REGISTER_OP_GPU_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h new file mode 100644 index 0000000000..faa38cf401 --- /dev/null +++ b/paddle/operators/elementwise_sub_op.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/elementwise_op.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseSubKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseCompute(ctx); + } +}; + +template +struct ElementwiseSubGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * dz_e; + } + } +}; + +template +struct ElementwiseSubOneGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * dz_e.sum(); + } + } +}; + +template +struct ElementwiseSubBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseSubBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseSubGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseGradCompute, + ElementwiseSubOneGradFunctor, + ElementwiseSubBroadCastGradFunctor, + ElementwiseSubBroadCast2GradFunctor>(ctx); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/framework/tests/test_elementwise_add_op.py new file mode 100644 index 0000000000..f3101a709b --- /dev/null +++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py @@ -0,0 +1,96 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + } + self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) + + +class TestElementwiseAddOp_Vector(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': 
np.random.random((32, )).astype("float32"), + 'Y': np.random.random((32, )).astype("float32") + } + self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseAddOp_broadcast_0(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(2).astype(np.float32) + } + + self.attrs = {'axis': 0} + self.outputs = { + 'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1) + } + + +class TestElementwiseAddOp_broadcast_1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(3).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1) + } + + +class TestElementwiseAddOp_broadcast_2(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(4).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4) + } + + +class TestElementwiseAddOp_broadcast_3(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': np.random.rand(2, 3, 4, 5).astype(np.float32), + 'Y': np.random.rand(3, 4).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1) + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_elementwise_div_op.py b/python/paddle/v2/framework/tests/test_elementwise_div_op.py new file mode 100644 index 0000000000..41cb2b7767 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_elementwise_div_op.py @@ -0,0 +1,105 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class ElementwiseDivOp(OpTest): + def setUp(self): + self.op_type = "elementwise_div" + """ Warning + CPU gradient check error! 
+ 'X': np.random.random((32,84)).astype("float32"), + 'Y': np.random.random((32,84)).astype("float32") + """ + self.inputs = { + 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) + + +class TestElementwiseDivOp_Vector(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [32]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [32]).astype("float32") + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2]).astype("float32") + } + + self.attrs = {'axis': 0} + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1)) + } + + +class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [3]).astype("float32") + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1)) + } + + +class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [4]).astype("float32") + } + + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4)) + } + + +class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [3, 4]).astype("float32") + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1)) + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py index e268cfddb2..cee4385a81 100644 --- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py +++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py @@ -3,14 +3,9 @@ import numpy as np from op_test import OpTest -class TestElementwiseMulOp_Matrix(OpTest): +class ElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" - """ Warning - CPU gradient check error! 
- 'X': np.random.random((32,84)).astype("float32"), - 'Y': np.random.random((32,84)).astype("float32") - """ self.inputs = { 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") @@ -32,7 +27,7 @@ class TestElementwiseMulOp_Matrix(OpTest): ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) -class TestElementwiseMulOp_Vector(OpTest): +class TestElementwiseMulOp_Vector(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" self.inputs = { @@ -41,22 +36,8 @@ class TestElementwiseMulOp_Vector(OpTest): } self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - -class TestElementwiseMulOp_broadcast_0(OpTest): +class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" self.inputs = { @@ -69,22 +50,8 @@ class TestElementwiseMulOp_broadcast_0(OpTest): 'Out': self.inputs['X'] * self.inputs['Y'].reshape(2, 1, 1) } - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - -class TestElementwiseMulOp_broadcast_1(OpTest): +class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" self.inputs = { @@ -97,22 +64,8 @@ class TestElementwiseMulOp_broadcast_1(OpTest): 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 3, 1) } - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - -class TestElementwiseMulOp_broadcast_2(OpTest): +class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" self.inputs = { @@ -124,22 +77,8 @@ class TestElementwiseMulOp_broadcast_2(OpTest): 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 4) } - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - -class TestElementwiseMulOp_broadcast_3(OpTest): +class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" self.inputs = { diff --git a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py new file mode 100644 index 0000000000..be982e8c57 --- /dev/null +++ 
b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py @@ -0,0 +1,96 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) + + +class TestElementwiseSubOp_Vector(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.random((32, )).astype("float32"), + 'Y': np.random.random((32, )).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_broadcast_0(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(2).astype(np.float32) + } + + self.attrs = {'axis': 0} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(2, 1, 1) + } + + +class TestElementwiseSubOp_broadcast_1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(3).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 1) + } + + +class TestElementwiseSubOp_broadcast_2(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(4).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 4) + } + + +class TestElementwiseSubOp_broadcast_3(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 4, 5).astype(np.float32), + 'Y': np.random.rand(3, 4).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 4, 1) + } + + +if __name__ == '__main__': + unittest.main()
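
Note (illustrative, not part of the patch): the new elementwise_op.h collapses X into a (pre, n, post) view so that Y can be broadcast as a (1, n, 1) block. The numpy sketch below re-implements get_mid_dims in Python purely for illustration and checks the RunBroadCast2 forward path and the BroadCast2 gradient reduction against the broadcast_3 test case, shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1:

import numpy as np

def get_mid_dims(x_dims, y_dims, axis):
    # Mirror of the C++ helper: pre = prod(x_dims[:axis]), n = prod(y_dims),
    # post = prod(x_dims[axis + len(y_dims):]).
    pre = int(np.prod(x_dims[:axis], dtype=np.int64))
    n = 1
    for i, d in enumerate(y_dims):
        assert x_dims[axis + i] == d, "Broadcast dimension mismatch."
        n *= d
    post = int(np.prod(x_dims[axis + len(y_dims):], dtype=np.int64))
    return pre, n, post

x = np.random.rand(2, 3, 4, 5).astype(np.float32)
y = np.random.rand(3, 4).astype(np.float32)

pre, n, post = get_mid_dims(x.shape, y.shape, axis=1)  # -> (2, 12, 5)

# Forward path of RunBroadCast2: broadcast Y over the collapsed (pre, n, post) view.
y_bcast = np.broadcast_to(y.reshape(1, n, 1), (pre, n, post))
out = (x.reshape(pre, n, post) + y_bcast).reshape(x.shape)
np.testing.assert_allclose(out, x + y.reshape(1, 3, 4, 1), rtol=1e-6)

# Backward path of the *BroadCast2GradFunctor classes: dY is the functor-specific
# term reduced over the pre and post axes; e.g. for elementwise_sub,
# dY = -sum(dOut) over axes (0, 2) of the collapsed view.
dout = np.ones_like(x)
dy = (-1.0 * dout.reshape(pre, n, post)).sum(axis=(0, 2)).reshape(y.shape)
assert dy.shape == y.shape

The Eigen kernels perform the same reshape/broadcast on flattened views, which is why the reshape-based expectations in the Python unit tests above line up with the kernel outputs.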