diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md
new file mode 120000
index 0000000000..955216ca62
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_cn.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000..f9fc68c37e
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index ad798003f5..37e608160d 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,6 +4,8 @@
 .. toctree::
   :maxdepth: 1
 
+  contribute_to_paddle_cn.md
+  write_docs_cn.md
   api_doc_std_cn.md
   new_op_cn.md
   new_op_kernel.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 80c899a82f..d7f8303501 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -4,6 +4,8 @@ Development
 .. toctree::
   :maxdepth: 1
 
+  contribute_to_paddle_en.md
+  write_docs_en.md
   api_doc_std_en.md
   new_op_en.md
   new_op_kernel.md
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 0c3f88d9c3..0fa3252d22 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -54,10 +54,10 @@
 </table>
 
 
-实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
 
 
-下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
 
 
 ## 实现C++类
@@ -85,17 +85,17 @@ The equation is: Out = X * Y
 };
 ```
 
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
 
    - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
    - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
 
 构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
 
-上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。
+上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
 
 
-再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例：
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例：
 
 ```cpp
 template <typename AttrType>
@@ -103,21 +103,21 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of scale operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
-    AddComment(R"DOC(Scale operator
-The equation is: Out = scale*X
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+Scale operator
+$$Out = scale*X$$
 )DOC");
-    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+    AddAttr<AttrType>("scale",
+                      "(float, default 1.0)"
+                      "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
   }
 };
 ```
 
-这个例子有两处不同：
-
-- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中，如果Op的某个输入不参与反向梯度的计算，请显示地调用`.NotInGradient()`进行设置。
-
-- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+这个例子有`AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
 
 
 ### 定义Operator类
@@ -205,7 +205,6 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
 
-
 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
 
@@ -215,7 +214,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     ```cpp
     namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
     REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
                   ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -223,8 +224,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
    在上面的代码中：
 
-    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
-    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
+    - `REGISTER_OPERATOR` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
     - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 
 
@@ -255,7 +255,7 @@ make mul_op
 
 ## 实现单元测试
 
-单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
 
 ### 前向Operator单测
 
@@ -315,7 +315,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp
 
 ### 编译和执行
 
-`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
+`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
 
 请注意，**不同于Op的编译测试，运行单元测试测时需要编译整个工程**，并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后，执行下面的命令来运行单元测试：
 
@@ -331,7 +331,6 @@ ctest -R test_mul_op
 
 ## 注意事项
 
-- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
-- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
+- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OPERATOR(B, ...)`等，这将会导致单元测试出错。
 - 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index a566a09131..c8fb7e3b3b 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -61,10 +61,10 @@ Registering the Op           | Ops are registered in `.cc` files; For Kernel reg
 </table>
 
 
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
 
 
-Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
 
 
 ## Implementing C++ Types
@@ -92,17 +92,17 @@ The equation is: Out = X * Y
 };
 ```
 
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
 
    - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
    - `framework::OpAttrChecker` is used to validate variable attributes.
 
 The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
 
-The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md).
 
 
-An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows:
 
 ```cpp
 template <typename AttrType>
@@ -120,11 +120,7 @@ The equation is: Out = scale*X
 };
 ```
 
-There are two changes in this example:
-
-- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`.
-
-- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
+Note `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0.
 
 
 ### Defining Operator
@@ -154,7 +150,7 @@ class MulOp : public framework::OperatorWithKernel {
 };
 ```
 
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member
 
 ```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
@@ -209,7 +205,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 
 Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.**
 
-`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc).
 
 To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
 
@@ -224,7 +220,9 @@ The definition of its corresponding backward operator, if applicable, is similar
 
     ```cpp
     namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
 
     REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
@@ -233,9 +231,8 @@ The definition of its corresponding backward operator, if applicable, is similar
 
    In that code block,
 
-    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
+    - `REGISTER_OPERATOR` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
     - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-
     - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
@@ -275,7 +272,7 @@ Unit tests for an operator include
 
 3. a scaling test for the backward operator.
 
-Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
+Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py).
 
 ### Testing Forward Operators
 
@@ -339,7 +336,7 @@ Some key points in checking gradient above include:
 ### Compiling and Running
 
 
-Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile.
+Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile.
 
 Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
 
@@ -357,7 +354,6 @@ ctest -R test_mul_op
 
 ## Remarks
 
-- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
-- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
+- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures.
 - If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
 - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/fluid/dev/write_docs_cn.rst b/doc/fluid/dev/write_docs_cn.rst
new file mode 120000
index 0000000000..2c281eaaf4
--- /dev/null
+++ b/doc/fluid/dev/write_docs_cn.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/dev/write_docs_en.rst b/doc/fluid/dev/write_docs_en.rst
new file mode 120000
index 0000000000..cb2b9b0ff1
--- /dev/null
+++ b/doc/fluid/dev/write_docs_en.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_en.rst
\ No newline at end of file
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
deleted file mode 120000
index c44cd9a731..0000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
new file mode 100644
index 0000000000..afc753aa42
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
@@ -0,0 +1,672 @@
+# Kubernetes on AWS
+
+我们将向你展示怎么样在AWS的Kubernetes集群上运行分布式PaddlePaddle训练，让我们从核心概念开始
+
+## PaddlePaddle分布式训练的核心概念
+
+### 分布式训练任务
+
+一个分布式训练任务可以看做是一个Kubernetes任务
+每一个Kubernetes任务都有相应的配置文件，此配置文件指定了像任务的pod个数之类的环境变量信息
+
+在分布式训练任务中，我们可以如下操作：
+
+1. 在分布式文件系统中，准备分块数据和配置文件（在此次教学中，我们会用到亚马逊分布式存储服务（EFS））
+2. 创建和提交一个kubernetes任务配置到集群中开始训练
+
+### Parameter Server和Trainer
+
+在paddlepaddle集群中有两个角色：参数服务器（pserver）者和trainer， 每一个参数服务器过程都会保存一部分模型的参数。每一个trainer都保存一份完整的模型参数，并可以利用本地数据更新模型。在这个训练过程中，trainer发送模型更新到参数服务器中，参数服务器职责就是聚合这些更新，以便于trainer可以把全局模型同步到本地。
+
+为了能够和pserver通信，trainer需要每一个pserver的IP地址。在Kubernetes中利用服务发现机制（比如：DNS、hostname）要比静态的IP地址要好一些，因为任何一个pod都会被杀掉然后新的pod被重启到另一个不同IP地址的node上。现在我们可以先用静态的IP地址方式，这种方式是可以更改的。
+
+参数服务器和trainer一块被打包成一个docker镜像，这个镜像会运行在被Kubernetes集群调度的pod中。
+
+### 训练者ID
+
+每一个训练过程都需要一个训练ID，以0作为基础值，作为命令行参数传递。训练过程因此用这个ID去读取数据分片。
+
+### 训练
+
+PaddlePaddle容器的入口是一个shell脚本，这个脚本可以读取Kubernetes内预置的环境变量。这里可以定义任务identity，在任务中identity可以用来远程访问包含所有pod的Kubernetes apiserver服务。
+
+每一个pod通过ip来排序。每一个pod的序列作为“pod id”。因为我们会在每一个pod中运行训练和参数服务，可以用“pod id”作为训练ID。入口脚本详细工作流程如下：
+
+1. 查找apiserver得到pod信息，通过ip排序来分配一个trainer_id。
+2. 从EFS持久化卷中复制训练数据到容器中。
+3. 从环境变量中解析paddle pserver和 paddle trainer的启动参数，然后开始启动流程。
+4. 以trainer_id来训练将自动把结果写入到EFS卷中。
+
+
+## AWS的Kubernetes中的PaddlePaddle
+
+### 选择AWS服务区域
+这个教程需要多个AWS服务工作在一个区域中。在AWS创建任何东西之前，请检查链接https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ 选择一个可以提供如下服务的区域：EC2, EFS, VPS, CloudFormation, KMS, VPC, S3。在教程中我们使用“Oregon(us-west-2)”作为例子。
+
+### 创建aws账户和IAM账户
+
+在每一个aws账户下可以创建多个IAM用户。允许为每一个IAM用户赋予权限，作为IAM用户可以创建/操作aws集群
+
+注册aws账户，请遵循用户指南。在AWS账户下创建IAM用户和用户组，请遵循用户指南
+
+请注意此教程需要如下的IAM用户权限：
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+- AWSKeyManagementServicePowerUser
+
+
+### 下载kube-aws and kubectl
+
+#### kube-aws
+
+在AWS中[kube-aws](https://github.com/coreos/kube-aws)是一个自动部署集群的CLI工具
+
+##### kube-aws完整性验证
+提示：如果你用的是非官方版本（e.g RC release）的kube-aws，可以跳过这一步骤。引入coreos的应用程序签名公钥:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+指纹验证：
+
+```
+gpg2 --fingerprint FC8A365E
+```
+正确的指纹是： `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+我们可以从发布页面中下载kube-aws，教程使用0.9.1版本 [release page](https://github.com/coreos/kube-aws/releases).
+
+验证tar包的GPG签名：
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+##### 安装kube-aws
+解压:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+添加到环境变量:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+[kubectl](https://Kubernetes.io/docs/user-guide/kubectl-overview/) 是一个操作Kubernetes集群的命令行接口
+
+利用`curl`工具从Kubernetes发布页面中下载`kubectl`
+
+```
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
+
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
+```
+
+为了能是kubectl运行必须将之添加到环境变量中 (e.g. `/usr/local/bin`):
+
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
+
+### 配置AWS证书
+
+首先检查这里 [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) 安装AWS命令行工具
+
+然后配置aws账户信息:
+
+```
+aws configure
+```
+
+
+添加如下信息:
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+```
+
+`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` 是创建aws账户和IAM账户的IAM的key和密码 [Create AWS Account and IAM Account](#create-aws-account-and-iam-account)
+
+描述任何运行在你账户中的实例来验证凭据是否工作:
+
+```
+aws ec2 describe-instances
+```
+
+### 定义集群参数
+
+#### EC2秘钥对
+
+秘钥对将认证ssh访问你的EC2实例。秘钥对的公钥部分将配置到每一个COREOS节点中。
+
+遵循 [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) Keypair用户指南来创建EC2秘钥对
+
+你可以使用创建好的秘钥对名称来配置集群.
+
+在同一工作区中秘钥对为EC2实例唯一码。在教程中使用 us-west-2 ，所以请确认在这个区域（Oregon）中创建秘钥对。
+
+在浏览器中下载一个`key-name.pem`文件用来访问EC2实例，我们待会会用到.
+
+
+#### KMS秘钥
+
+亚马逊的KMS秘钥在TLS秘钥管理服务中用来加密和解密集群。如果你已经有可用的KMS秘钥，你可以跳过创建新秘钥这一步，提供现存秘钥的ARN字符串。
+
+利用aws命令行创建kms秘钥:
+
+```
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+我们稍后用到`Arn` 的值.
+
+在IAM用户许可中添加多个内联策略.
+
+进入[IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home)。点击`Users`按钮，点击刚才创建的用户，然后点击`Add inline policy`按钮，选择`Custom Policy`
+
+粘贴内联策略:
+
+```
+ (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
+            ]
+        },
+		{
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+`Version` : 值必须是"2012-10-17".
+`AWS_ACCOUNT_ID`: 你可以从命令行中获取:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
+
+`MY_CLUSTER_NAME`: 选择一个你喜欢的MY_CLUSTER_NAME，稍后会用到。
+请注意，堆栈名称必须是正则表达式：[a-zA-Z][-a-zA-Z0-9*]*， 在名称中不能有"_"或者"-"，否则kube-aws在下面步骤中会抛出异常
+
+#### 外部DNS名称
+
+当集群被创建后，基于DNS名称控制器将会暴露安全的TLS API.
+
+DNS名称含有CNAME指向到集群DNS名称或者记录指向集群的IP地址。
+
+我们稍后会用到DNS名称，如果没有DNS名称的话，你可以选择一个（比如：`paddle`）还可以修改`/etc/hosts`用本机的DNS名称和集群IP关联。还可以在AWS上增加一个名称服务来关联paddle集群IP，稍后步骤中会查找集群IP.
+
+#### S3 bucket
+
+在启动Kubernetes集群前需要创建一个S3 bucket
+
+在AWS上创建s3 bucket会有许多的bugs，所以使用[s3 console](https://console.aws.amazon.com/s3/home?region=us-west-2)。
+
+链接到 `Create Bucket`，确保在us-west-2 (Oregon)上创建一个唯一的BUCKET_NAME。
+
+#### 初始化assets
+
+在本机创建一个目录用来存放产生的assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+利用KMS Arn、秘钥对名称和前一步产生的DNS名称来初始化集群的CloudFormation栈:
+
+```
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
+
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+这里的`us-west-2a`用于参数`--availability-zone`，但必须在AWS账户的有效可用区中
+
+如果不能切换到其他的有效可用区（e.g., `us-west-2a`, or `us-west-2b`），请检查`us-west-2a`是支持`aws ec2 --region us-west-2 describe-availability-zones`。
+
+现在在asset目录中就有了集群的主配置文件cluster.yaml。
+
+默认情况下kube-aws会创建一个工作节点，修改`cluster.yaml`让`workerCount`从1个节点变成3个节点.
+
+#### 呈现asset目录内容
+
+在这个简单的例子中，你可以使用kuber-aws生成TLS身份和证书
+
+```
+kube-aws render credentials --generate-ca
+```
+
+下一步在asset目录中生成一组集群assets.
+
+```
+kube-aws render stack
+```
+asserts(模板和凭证)用于创建、更新和当前目录被创建的Kubernetes集群相关联
+
+### 启动Kubernetes集群
+
+#### 创建一个在CloudFormation模板上定义好的实例
+
+现在让我们创建集群（在命令行中选择任意的 `PREFIX`）
+
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
+
+`BUCKET_NAME`: t在[S3 bucket](#s3-bucket)上使用的bucket名称
+
+
+#### 配置DNS
+
+你可以执行命令 `kube-aws status`来查看创建后集群的API.
+
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
+如果你用DNS名称，在ip上设置任何记录或是安装CNAME点到`Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`)
+
+##### 查询IP地址
+
+用命令`dig`去检查负载均衡器的域名来获取ip地址.
+
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
+
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
+
+在上面的例子中，`54.241.164.52`, `54.67.102.112`这两个ip都将是工作状态
+
+*如果你有DNS名称*，设置记录到ip上，然后你可以跳过“Access the cluster”这一步
+
+*如果没有自己的DNS名称*
+
+编辑/etc/hosts文件用DNS关联IP
+
+##### 更新本地的DNS关联
+编辑`/etc/hosts`文件用DNS关联IP
+##### 在VPC上添加route53私有名称服务
+ - 打开[Route53 Console](https://console.aws.amazon.com/route53/home)
+ - 根据配置创建域名zone
+   - domain名称为: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
+
+   ![route53 zone setting](src/route53_create_zone.png)
+ - 添加记录
+    - 点击zone中刚创建的“paddle”
+    - 点击按钮“Create record set”
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
+
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - 检查名称服务
+    - 连接通过kube-aws via ssh创建的任何实例
+    - 运行命令"host paddle"，看看是否ip为返回的kube-controller的私有IP
+
+#### 进入集群
+
+集群运行后如下命令会看到:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
+
+
+### 集群安装弹性文件系统
+
+训练数据存放在AWS上的EFS分布式文件系统中.
+
+1. 在[security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)为EFS创建一个安全组
+  1. 可以看到`paddle-cluster-sg-worker` (在sg-055ee37d镜像中)安全组id
+  <center>![](src/worker_security_group.png)</center>
+
+  2. 增加安全组`paddle-efs` ，以`paddle-cluster-sg-worker`的group id作为用户源和`ALL TCP`入栈规则。增加vpc `paddle-cluster-vpc`, 确保可用区是在[Initialize Assets](#initialize-assets)的时候用到的那一个.
+  <center>![](src/add_security_group.png)</center>
+
+2. 利用`paddle-cluster-vpc`私有网络在[EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) 中创建弹性文件系统, 确定子网为`paddle-cluster-Subnet0`和安全区为`paddle-efs`.
+<center>![](src/create_efs.png)</center>
+
+
+### 开始在AWS上进行paddlepaddle的训练
+
+#### 配置Kubernetes卷指向EFS
+
+首先需要创建一个持久卷[PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) 到EFS上
+
+用 `pv.yaml`形式来保存
+```
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
+```
+
+`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`，看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com`
+
+运行下面的命令来创建持久卷:
+```
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
+```
+下一步创建 [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/)来声明持久卷
+
+用`pvc.yaml`来保存.
+```
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
+```
+
+行下面命令来创建持久卷声明:
+```
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
+```
+
+#### 准备训练数据
+
+启动Kubernetes job在我们创建的持久层上进行下载、保存并均匀拆分训练数据为3份.
+
+用`paddle-data-job.yaml`保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
+```
+
+运行下面的命令来启动任务:
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
+```
+任务运行大概需要7分钟，可以使用下面命令查看任务状态，直到`paddle-data`任务的`SUCCESSFUL`状态为`1`时成功，这里here有怎样创建镜像的源码
+```
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
+```
+数据准备完成后的结果是以镜像`paddlepaddle/paddle-tutorial:k8s_data`存放，可以点击这里[here](src/k8s_data/README.md)查看如何创建docker镜像源码
+
+#### 开始训练
+
+现在可以开始运行paddle的训练任务，用`paddle-cluster-job.yaml`进行保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
+      containers:
+      - name: trainer
+        image: paddlepaddle/paddle-tutorial:k8s_train
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: quick_start
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
+        volumeMounts:
+        - mountPath: "/home/jobpath"
+          name: efs
+        ports:
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
+      restartPolicy: Never
+```
+
+`parallelism: 3, completions: 3` 意思是这个任务会同时开启3个paddlepaddle的pod，当pod启动后3个任务将被完成。
+
+`env` 参数代表容器的环境变量，在这里指定paddlepaddle的参数.
+
+`ports` 指定TCP端口7164 - 7167和`pserver`进行连接，port从`CONF_PADDLE_PORT`(7164)到`CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1`(7167)。我们使用多个端口密集和稀疏参数的更新来提高延迟
+
+运行下面命令来启动任务.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml
+```
+
+检查pods信息
+
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
+
+检查指定pod的控制台输出
+```
+kubectl --kubeconfig=kubeconfig log -f POD_NAME
+```
+
+`POD_NAME`: 任何一个pod的名称 (e.g., `paddle-cluster-job-cm469`).
+
+运行`kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job`来检查训练任务的状态，将会在大约20分钟完成
+
+`pserver`和`trainer`的细节都隐藏在docker镜像`paddlepaddle/paddle-tutorial:k8s_train`中，这里[here](src/k8s_train/README.md) 有创建docker镜像的源码.
+
+#### 检查训练输出
+
+训练输出（模型快照和日志）将被保存在EFS上。我们可以用ssh登录到EC2的工作节点上，查看mount过的EFS和训练输出.
+
+1. ssh登录EC2工作节点
+```
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
+```
+
+`INSTANCE_IP`: EC2上Kubernetes工作节点的公共IP地址，进入[EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) 中检查任何`paddle-cluster-kube-aws-worker`实例的 `public IP`
+
+2. 挂载EFS
+```
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
+```
+
+`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`，看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com`.
+
+文件夹`efs`上有这结构相似的node信息:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+	|   |-- pass-00001...
+```
+`server.log` 是`pserver`的log日志，`train.log`是`trainer`的log日志，模型快照和描述存放在`pass-0000*`.
+
+### Kubernetes集群卸载或删除
+
+#### 删除EFS
+
+到[EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) 中删除创建的EFS卷
+
+#### 删除安全组
+
+去[Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) 删除安全组`paddle-efs`.
+
+#### 删除S3 bucket
+
+进入 [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#)删除S3 bucket
+
+#### 销毁集群
+
+```
+kube-aws destroy
+```
+
+命令会立刻返回，但需要大约5分钟来销毁集群
+
+可以进入 [CludFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active)检查销毁的过程。
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
index b05b66415f..67c7b774e9 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
@@ -134,7 +134,7 @@
 
 **输入不等长** 是指recurrent_group的多个输入序列，在每个时间步的子序列长度可以不相等。但序列输出时，需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致，默认指定第一个输入。 
 
-示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf>`_\ 。
+示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ 。
 
 示例3对于单层RNN和双层RNN数据完全相同。
 
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
index e5aa05c117..ae997f0805 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
@@ -1,4 +1,226 @@
+..  _algo_hrnn_rnn_api_compare:
+
+#####################
 API comparision between RNN and hierarchical RNN
-================================================
+#####################
+
+This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
+
+Example 1：Hierarchical RNN without Memory between subsequences
+================================
+
+The classical case in the hierarchical RNN is to perform sequence operations on each time series data in the inner layers seperately. And the sequence operations in the inner layers is independent, that is, it does not need to use Memory. 
+
+In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows：
+
+* RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_layer_group.conf>`_
+* Hierarchical RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_layer_group.conf>`_
+
+
+Reading hierarchical sequence data
+----------------
+
+Firstly, the original data in this example is as follows \:
+
+- The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
+    :language: text
+
+
+- The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ .
+
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
+    :language: text
+
+Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\：
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 21-39
+    :linenos:
+
+- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows: 
+  
+  * DataProvider returns two parts, that are "words" and "label"，as line 19 in the above code. 
+
+    - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data. 
+    - "label" is the categorical label of each sentence, whose data type is integer_value. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 42-71
+    :linenos:
+
+- As for the same data, the DataProvider code for hierarchical time series. Its description is as follows: 
+
+  - DataProvider returns two lists of data, that are "sentences" and "labels", corresponding to the sentences and labels in each group in the original data of hierarchical time series. 
+  - "sentences" comes from the hierarchical time series original data. As it contains every sentences in each group internally, and each sentences are represented by a list of word table indices, so its data type is integer_value_sub_sequence, which is hierarchical time series. 
+  - "labels" is the categorical lable of each sentence, so it is a sigle-layer time series. 
+
+
+Model configuration
+------------------------------------------
+
+Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
+    :language: python
+    :lines: 38-63
+    :linenos:
+    :emphasize-lines:  9-15
+
+
+Secondly, let's look at the model configuration of hierarchical RNN which has the same semantic meaning. \:
+
+* Most layers in PaddlePaddle do not care about whether the input is time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step. 
+
+* In the hightlighted part of line 7 to line 26 of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series. 
+
+  * Use the function \ :code:`recurrent_group`\ to transform. Input sequences need to be passed in when transforming. As we want to transform hierarchical time series into single-layer sequences, we need to lable the input data as \ :code:`SubsequenceInput`\ .
+  
+  * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network. This is equivalent to single-layer RNN configuration. 
+
+* Similar to single-layer RNN configuration, we only use the last vector after the encode of LSTM. So we use the operation of \ :code:`last_seq`\ to \ :code:`recurrent_group`\ . But unlike single-layer RNN, we use the last element of every subsequence, so we need to set \ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ . 
+
+* Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
+    :language: python
+    :lines: 38-64
+    :linenos:
+    :emphasize-lines: 7-26
+
+Example 2：Hierarchical RNN with Memory between subsequences
+================================
+
+This example is intended to implement two fully-equivalent fully-connected RNNs using single-layer RNN and hierarchical RNN. 
+
+* As for single-layer RNN, input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ .
+
+* As for hierarchical RNN, input is a hierarchical time series which elements are arbitrarily combination of data in single-layer RNN, e.g. \ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`. 
+
+model configuration
+------------------
+
+We select the different parts between single-layer RNN and hierarchical RNN configurations, to compare and analyze the reason why they have same semantic meanings. 
+
+- single-layer RNN：passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
+    :language: python
+    :lines: 36-48
+
+- hierarchical RNN, the outer layer's memory is an element. 
+
+  - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. 
+  - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. 
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
+    :language: python
+    :lines: 39-66
+
+..  warning::
+    Currently PaddlePaddle only supports the case that the lengths of the time series of Memory in each time step are the same. 
+
+Example 3：hierarchical RNN with unequal length inputs
+==========================
+
+.. role:: red
+
+.. raw:: html
+
+    <style> .red {color:red} </style>
+
+**unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. 
+
+The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ . 
+
+The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. 
+
+* For the single-layer RNN, the data has two samples, which are \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each of the data for the single-layer RNN has two group of features. 
+
+* On the basis of the single-layer's data, hierarchical RNN's data randomly adds some partitions. For example, the first sample is transformed to \ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ . 
+
+* You need to pay attention that, PaddlePaddle only supports multiple input hierarchical RNNs that have same amount of subsequences currently. In this example, the two features both have 3 subsequences. Although the length of each subsequence can be different, the amount of subsequences should be the same. 
+
+
+model configuration
+--------
+
+Similar to Example 2's configuration, Example 3's configuration uses single-layer and hierarchical RNN to implement 2 fully-equivalent fully-connected RNNs. 
+
+* single-layer RNN\:
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 42-59
+    :linenos:
+
+* hierarchical RNN\ \:
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 41-80
+    :linenos:
+
+In the above code, the usage of single-layer and hierarchical RNNs are similar to Example 2, which difference is that it processes 2 inputs simultaneously. As for the hierarchical RNN, the lengths of the 2 input's subsequences are not equal. But we use the parameter \ :code:`targetInlink` \ to set the outper layer's \ :code:`recurrent_group` \ 's output format, so the shape of outer layer's output is the same as the shape of \ :code:`emb2`\ . 
+
+
+Glossary
+======
+
+..  _glossary_memory:
+
+Memory
+------
+
+Memory is a concept when PaddlePaddle is implementing RNN. RNN, recurrent neural network, usually requires some dependency between time steps, that is, the neural network in current time step depends on one of the neurons in the neural network in previous time steps, as the following figure shows: 
+
+..  graphviz:: src/glossary_rnn.dot
+
+The dotted connections in the figure, is the network connections across time steps. When PaddlePaddle is implementing RNN, this connection accross time steps is implemented using a special neural network unit, called Memory. Memory can cache the output of one of the neurons in previous time step, then can be passed to another neuron in next time step. The implementation of an RNN using Memory is as follows: 
+
+..  graphviz:: src/glossary_rnn_with_memory.dot
+
+With this method, PaddlePaddle can easily determine which outputs should cross time steps, and which should not. 
+
+..  _glossary_timestep:
+
+time step
+------
+
+refers to time series
+
+
+..  _glossary_sequence:
+
+time series
+--------
+
+Time series is a series of featured data. The order among these featured data is meaningful. So it is a list of features, not a set of features. As for each element of this list, or the featured data in each series, is called a time step. It must be noted that, the concepts of time series and time steps, are not necessarrily related to "time". As long as the "order" in a series of featured data is meaningful, it can be the input of time series. 
+
+For example, in text classification task, we regard a sentence as a time series. So, each word in the sentence can become the index of the word in the word table. So this sentence can be represented as a list of these indices, e.g.:code:`[9, 2, 3, 5, 3]` . 
+
+For a more detailed and accurate definition of the time series, please refer to `Wikipedia of Time series <https://en.wikipedia.org/wiki/Time_series>`_  or `Chinese Wikipedia of time series <https://zh.wikipedia.org/wiki/%E6%99%82%E9%96%93%E5%BA%8F%E5%88%97>`_  . 
+
+In additioin, Paddle always calls time series as :code:`Sequence` . They are a same concept in Paddle's documentations and APIs. 
+
+..  _glossary_RNN:
+
+RNN
+---
+
+In PaddlePaddle's documentations, RNN is usually represented as :code:`Recurrent neural network` . For more information, please refer to `Wikipedia Recurrent neural network <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_ or `Chinese Wikipedia <https://zh.wikipedia.org/wiki/%E9%80%92%E5%BD%92%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C>`_ . 
+
+In PaddlePaddle, RNN usually means, for the input data of a time series, the neural network between each time steps has a certain relevance. For example, the input of a certain neuron is the output of a certain neuron in the neural network of the last time step. Or, as for each time step, the network structure of the neural network has a directed ring structure. 
+
+..  _glossary_hierarchical_RNN:
+
+hierarchical RNN
+-------
+
+Hierarchical RNN, as the name suggests, means there is a nested relationship in RNNs. The input data is a time series, but for each of the inner featured data, it is also a time series, namely 2-dimentional array, or, array of array. Hierarchical RNN is a neural network that can process this type of input data. 
+
+For example, the task of text classification of a paragragh, meaning to classify a paragraph of sentences. We can treat a paragraph as an array of sentences, and each sentence is an array of words. This is a type of the input data for the hierarchical RNN. We encode each sentence of this paragraph into a vector using LSTM, then encode each of the encoded vectors into a vector of this paragraph using LSTM. Finally we use this paragraph vector perform classification, which is the neural network structure of this hierarchical RNN. 
 
-TBD
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index bcd61335be..efc7051582 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -90,7 +90,7 @@ struct TestBroadcastOpHandle {
     op_handle_->AddInput(dummy_var_handle);
 
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
       VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
       vars_.emplace_back(out_var_handle);
       op_handle_->AddOutput(out_var_handle);
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index ff6d91c1da..7ff0efe093 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -28,8 +28,8 @@ ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
 void ComputationOpHandle::RunImpl() {
   auto *cur_ctx = dev_ctxes_[place_];
   for (auto *in : inputs_) {
-    bool need_wait =
-        in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
+    bool need_wait = in->generated_op_ &&
+                     in->generated_op_->DeviceContext(place_) != cur_ctx;
     if (need_wait) {
       in->generated_op_->Wait(cur_ctx);
     }
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index d6d2d731ca..c363b973d9 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -24,10 +27,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
-  std::unique_ptr<OperatorBase> op_;
-  Scope *scope_;
-  platform::Place place_;
-
+ public:
   ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
                       platform::Place place);
 
@@ -35,6 +35,11 @@ struct ComputationOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  Scope *scope_;
+  platform::Place place_;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index e3e7c55d15..946ee91a66 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -51,23 +51,23 @@ void FetchOpHandle::RunImpl() {
     auto *var = static_cast<VarHandle *>(input);
     var->generated_op_->Wait(cpu_ctx);
   }
-
   tensors_.resize(inputs_.size());
-  auto *var = static_cast<VarHandle *>(inputs_[0]);
-  auto &var_name = var->name_;
+  auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
+  auto &var_name = var_handle->name_;
   platform::CPUPlace cpu;
   auto &scopes = *local_scopes_;
 
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(kLocalExecScopeName)
-                  ->Get<Scope *>()
-                  ->FindVar(var_name)
-                  ->Get<framework::LoDTensor>();
-    if (platform::is_gpu_place(var->place_)) {
+    auto *var =
+        scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
+                            var_name);
+    auto &t = var->Get<framework::LoDTensor>();
+    if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
-      dev_ctxes_[t.place()]->Wait();
+      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 904b2d669f..b49f3df338 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,11 +27,7 @@ namespace framework {
 namespace details {
 
 struct FetchOpHandle : public OpHandleBase {
-  FeedFetchList *data_;
-  size_t offset_;
-  std::vector<Scope *> *local_scopes_;
-  std::vector<LoDTensor> tensors_;
-
+ public:
   FetchOpHandle(FeedFetchList *data, size_t offset,
                 std::vector<Scope *> *local_scopes);
 
@@ -42,6 +41,12 @@ struct FetchOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  FeedFetchList *data_;
+  size_t offset_;
+  std::vector<Scope *> *local_scopes_;
+  std::vector<LoDTensor> tensors_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
index f576f047f3..c394dd7a14 100644
--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -29,9 +29,7 @@ namespace framework {
 namespace details {
 
 struct GatherOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-
+ public:
   GatherOpHandle(const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places);
 
@@ -41,8 +39,11 @@ struct GatherOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-
   void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
+
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 2da8c89d2d..9481579f6c 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -78,7 +78,7 @@ struct TestGatherOpHandle {
     op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
     // add input
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
       auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
       vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 4d76dbf7f6..002952436e 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -60,7 +60,8 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
                                                 const platform::Place &p,
                                                 const size_t &i) const {
   auto *op_handle = result->ops_.back().get();
-  op_handle->dev_ctxes_[p] = platform::DeviceContextPool::Instance().Get(p);
+  op_handle->SetDeviceContext(p,
+                              platform::DeviceContextPool::Instance().Get(p));
 
   auto var_names = op.InputArgumentNames();
 
@@ -89,101 +90,25 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
 
   bool is_forwarding = true;
   for (auto *op : program.Block(0).AllOps()) {
-    bool change_forward = false;
-    if (!is_forwarding) {
-      // FIXME(yy): Do not hard code like this
-      if (op->OutputArgumentNames().size() == 1 &&
-          op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
-        continue;  // Drop fill 1. for backward coeff;
-      }
-    }
-
-    // append send op if program is distributed trainer main program.
-    // always use the first device
-    if (!is_forwarding && op->Type() == "send") {
-      auto &p = places_[0];
-      auto *s = local_scopes_[0];
-      // FIXME(wuyi): send op always copy from GPU 0
-      result.ops_.emplace_back(new SendOpHandle(*op, s, p));
-      // Create inputs for output on original place and no ssa output
-      // is created for send op.
-      CreateOpHandleIOs(&result, *op, p, 0);
-      continue;
-    }
-
-    for (size_t i = 0; i < places_.size(); ++i) {
-      auto &p = places_[i];
-      auto *s = local_scopes_[i];
-
-      result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
-      auto *op_handle = result.ops_.back().get();
-      CreateOpHandleIOs(&result, *op, p, i);
-
-      auto var_names = op->OutputArgumentNames();
-
-      if (is_forwarding) {
-        if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
-// Insert ScaleCost OpHandle
-#ifdef PADDLE_WITH_CUDA
-          auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
-#else
-          auto *communication_dev_ctx =
-              platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-#endif
-
-          op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
-                                                communication_dev_ctx);
-          result.ops_.emplace_back(op_handle);
-
-          // FIXME: Currently ScaleLossGradOp only use device_count as scale
-          // factor. So it does not depend on any other operators.
-          // VarHandle *loss = GetVarHandle(loss_var_name, place);
-          // loss->pending_ops_.emplace_back(op_handle);
-          // op_handle->inputs_.emplace_back(loss);
-
-          CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
-          change_forward = true;
-        }
-      }
-    }
-
-    if (change_forward) {
+    if (op->Type() == "send") {
+      // append send op if program is distributed trainer main program.
+      // always use the first device
+      CreateSendOp(&result, *op);
+    } else if (IsScaleLossOp(*op)) {
+      CreateScaleLossGradOp(&result);
       is_forwarding = false;
-    }
-
-    if (!is_forwarding) {
-      auto var_names = op->OutputArgumentNames();
-      // Currently, we assume that once gradient is generated, it can be
-      // broadcast, and each gradient is only broadcast once. But there are no
-      // other cases, for example, we need to adjust the gradient according to
-      // the input when we get the gradient, which is not considered at present.
-      for (auto &og : var_names) {
-        if (grad_names_.count(og) != 0 &&
-            og_has_been_broadcast.count(og) == 0) {  // is param grad
-                                                     // Insert NCCL AllReduce Op
-          og_has_been_broadcast.insert(og);
-#ifdef PADDLE_WITH_CUDA
-          result.ops_.emplace_back(
-              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
-          auto *op_handle = result.ops_.back().get();
-
-          for (size_t i = 0; i < places_.size(); ++i) {
-            auto &p = places_[i];
-            auto &vars = result.vars_[i][og];
-
-            if (vars.empty()) {  // This device has no data. continue.
-              continue;
-            }
-            auto &prev_grad = vars[vars.size() - 1];
-            op_handle->AddInput(prev_grad.get());
-
-            auto var = new VarHandle(vars.size() - 1, i, og, p);
-            vars.emplace_back(var);
-            op_handle->AddOutput(var);
+    } else {
+      CreateComputationalOps(&result, *op);
+      if (!is_forwarding) {
+        // Currently, we assume that once gradient is generated, it can be
+        // broadcast, and each gradient is only broadcast once. But there are no
+        // other cases, for example, we need to adjust the gradient according to
+        // the input when we get the gradient, which is not considered at
+        // present.
+        for (auto &og : op->OutputArgumentNames()) {
+          if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
+            InsertNCCLAllReduceOp(&result, og);
           }
-#else
-          PADDLE_ENFORCE("Not implemented");
-#endif
         }
       }
     }
@@ -207,7 +132,95 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   }
 
   return std::unique_ptr<SSAGraph>(graph);
-}  // namespace details
+}
+
+void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
+    SSAGraph *result, const std::string &og) const {
+#ifdef PADDLE_WITH_CUDA
+  result->ops_.emplace_back(
+      new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+  auto *op_handle = result->ops_.back().get();
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    auto &vars = result->vars_[i][og];
+    PADDLE_ENFORCE(!vars.empty());
+    auto &prev_grad = vars.back();
+    op_handle->AddInput(prev_grad.get());
+
+    auto var = new VarHandle(vars.size() - 1, i, og, p);
+    vars.emplace_back(var);
+    op_handle->AddOutput(var);
+  }
+#else
+  PADDLE_ENFORCE("Not implemented");
+#endif
+}
+
+bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
+    const std::string &og,
+    std::unordered_set<std::string> *og_has_been_broadcast) const {
+  bool is_pg_once =
+      grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
+  if (is_pg_once) {
+    // Insert NCCL AllReduce Op
+    og_has_been_broadcast->insert(og);
+  }
+  return is_pg_once;
+}
+
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
+  for (size_t i = 0; i < places_.size(); ++i) {
+// Insert ScaleCost OpHandle
+#ifdef PADDLE_WITH_CUDA
+    auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
+#else
+    auto *communication_dev_ctx =
+        platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+
+    auto *op_handle =
+        new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
+                                  places_[i], communication_dev_ctx);
+    result->ops_.emplace_back(op_handle);
+
+    // FIXME: Currently ScaleLossGradOp only use device_count as scale
+    // factor. So it does not depend on any other operators.
+    // VarHandle *loss = GetVarHandle(loss_var_name, place);
+    // loss->pending_ops_.emplace_back(op_handle);
+    // op_handle->inputs_.emplace_back(loss);
+
+    CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
+                   i);
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
+                                                     const OpDesc &op) const {
+  for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
+    auto p = places_[scope_idx];
+    auto s = local_scopes_[scope_idx];
+    result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
+    CreateOpHandleIOs(result, op, p, scope_idx);
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
+                                           const OpDesc &op) const {
+  auto &p = places_[0];
+  auto *s = local_scopes_[0];
+  // FIXME(wuyi): send op always copy from GPU 0
+  result->ops_.emplace_back(new SendOpHandle(op, s, p));
+  // Create inputs for output on original place and no ssa output
+  // is created for send op.
+  CreateOpHandleIOs(result, op, p, 0);
+}
+
+bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
+  // FIXME(yy): Do not hard code like this
+  return op.OutputArgumentNames().size() == 1 &&
+         op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index f1518d75b4..b5ba2dbd3c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
+
+  bool IsScaleLossOp(const OpDesc &op) const;
+
+  void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
+
+  void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
+
+  void CreateScaleLossGradOp(SSAGraph *result) const;
+
+  bool IsParameterGradientOnce(
+      const std::string &og,
+      std::unordered_set<std::string> *og_has_been_broadcast) const;
+
+  void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index ad14a3c5cb..a0c321843e 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -27,10 +27,6 @@ namespace framework {
 namespace details {
 
 struct NCCLAllReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-  const platform::NCCLContextMap &nccl_ctxs_;
-
   NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
                         const std::vector<platform::Place> &places,
                         const platform::NCCLContextMap &ctxs);
@@ -43,6 +39,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::NCCLContextMap &nccl_ctxs_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index a9a6c8d39c..00f213f3ed 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -27,28 +27,15 @@ namespace details {
 constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 
 class OpHandleBase {
- private:
-  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
-
  public:
-  std::vector<VarHandleBase *> inputs_;
-  std::vector<VarHandleBase *> outputs_;
-  std::unordered_map<platform::Place, platform::DeviceContext *,
-                     platform::PlaceHash>
-      dev_ctxes_;
-
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<int, cudaEvent_t> events_;
-#endif
-
   OpHandleBase() {}
 
+  virtual ~OpHandleBase();
+
   std::string DebugString() const;
 
   virtual std::string Name() const = 0;
 
-  virtual ~OpHandleBase();
-
   void Run(bool use_event);
 
   virtual void Wait(platform::DeviceContext *waited_dev);
@@ -61,6 +48,18 @@ class OpHandleBase {
   // will likely block other computations.
   virtual bool IsMultiDeviceTransfer() { return false; }
 
+  const platform::DeviceContext *DeviceContext(platform::Place place) {
+    return dev_ctxes_[place];
+  }
+
+  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
+    dev_ctxes_[place] = ctx_;
+  }
+
+  const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
+
+  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
+
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
@@ -68,6 +67,18 @@ class OpHandleBase {
                          const std::function<void()> &callback);
 
   virtual void RunImpl() = 0;
+
+  std::vector<VarHandleBase *> inputs_;
+  std::vector<VarHandleBase *> outputs_;
+  std::unordered_map<platform::Place, platform::DeviceContext *,
+                     platform::PlaceHash>
+      dev_ctxes_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<int, cudaEvent_t> events_;
+#endif
+
+  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
index ab7353a4fc..d93d599d46 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -23,10 +25,6 @@ namespace framework {
 namespace details {
 
 struct ScaleLossGradOpHandle : public OpHandleBase {
-  float coeff_;
-  Scope *scope_;
-  platform::Place place_;
-
   ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
                         platform::DeviceContext *context);
 
@@ -36,6 +34,11 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  float coeff_;
+  Scope *scope_;
+  platform::Place place_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h
index 173f9d7261..2f78811fad 100644
--- a/paddle/fluid/framework/details/send_op_handle.h
+++ b/paddle/fluid/framework/details/send_op_handle.h
@@ -28,10 +28,6 @@ namespace framework {
 namespace details {
 
 struct SendOpHandle : public OpHandleBase {
-  std::unique_ptr<OperatorBase> op_;
-  const Scope* local_scope_;
-  const platform::Place& place_;
-
   SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
                const platform::Place& place);
 
@@ -43,6 +39,11 @@ struct SendOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
+
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  const Scope* local_scope_;
+  const platform::Place& place_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 25e8c77bb4..6a56752755 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -117,12 +117,12 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
     std::string op_name = "op_" + std::to_string(op_id++);
     sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
          << std::endl;
-    for (auto in : op->inputs_) {
+    for (auto in : op->Inputs()) {
       std::string var_name = "var_" + std::to_string(vars[in]);
       sout << var_name << " -> " << op_name << std::endl;
     }
 
-    for (auto out : op->outputs_) {
+    for (auto out : op->Outputs()) {
       std::string var_name = "var_" + std::to_string(vars[out]);
       sout << op_name << " -> " << var_name << std::endl;
     }
@@ -133,7 +133,7 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
 
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
   for (auto &op : graph->ops_) {
-    if (!op->outputs_.empty()) {
+    if (!op->Outputs().empty()) {
       continue;
     }
     auto *dummy_leaf = new DummyVarHandle();
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 3d2bd633af..14e75e7b7b 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -53,7 +53,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   };
 
   auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) {
-    pending_ops.insert({&op_instance, op_instance.inputs_.size()});
+    pending_ops.insert({&op_instance, op_instance.Inputs().size()});
   };
 
   // Transform SSAGraph to pending_ops & pending_vars
@@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   }
 
   for (auto &op : graph_->ops_) {
-    if (op->inputs_.empty()) {  // Special case, Op has no input.
+    if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op.get());
     } else {
       InsertPendingOp(*op);
@@ -99,7 +99,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     fetch_ops.emplace_back(op);
 
     for (auto &p : places_) {
-      op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
+      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
     }
 
     for (auto *var : vars) {
@@ -180,7 +180,7 @@ void ThreadedSSAGraphExecutor::RunOp(
       op->Run(use_event_);
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
-      ready_var_q->Extend(op->outputs_);
+      ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EnforceNotMet ex) {
       exception_.reset(new platform::EnforceNotMet(ex));
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index cf697187d6..b4d3fa25c3 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -69,8 +70,7 @@ class GradOpDescMakerBase {
                       " for input argument with a list of variables, "
                       " drop_empty_grad is not allowed because it makes"
                       " the correspondence bewteen a variable and its gradient"
-                      " ambiguous. Use REGISTER_OP_EX to register the op"
-                      " or call InputGrad(?,false) in GradOpDescMaker."
+                      " ambiguous."
                       " Op type %s",
                       fwd_op_.Type());
 
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 614dd8cd00..cd6777e60a 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -119,7 +119,7 @@ class OpDesc {
 
   void InferVarType(BlockDesc *block) const;
 
-  void MarkAsTarget() { desc_.set_is_target(true); }
+  void SetIsTarget(bool is_target) { desc_.set_is_target(is_target); }
 
   void Flush();
 
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index f1424f13b4..748317438b 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <algorithm>
 #include <atomic>
+#include <string>
+#include <tuple>
 #include <type_traits>
 #include <typeinfo>
 #include <unordered_map>
@@ -141,36 +143,6 @@ class OpKernelRegistrar : public Registrar {
     return 0;                                                          \
   }
 
-/**
- * Macro to register Operator. When the input is duplicable, you should
- * use REGISTER_OP_EX with drop_empty_grad=false instead.
- */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
-                    grad_op_class)                                   \
-  REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,    \
-                 grad_op_class, true)
-
-// When an argument is duplicable, we need to use this version.
-// Perhaps we can omit DropEmptyIG template parameter and
-// only have one version of REGISTER_OP.
-#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,       \
-                       grad_op_class, drop_empty_grad)                        \
-  REGISTER_OPERATOR(grad_op_type, grad_op_class);                             \
-  class _GradOpDescMaker_##grad_op_type##_                                    \
-      : public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
-    using ::paddle::framework::DefaultGradOpDescMaker<                        \
-        drop_empty_grad>::DefaultGradOpDescMaker;                             \
-                                                                              \
-   protected:                                                                 \
-    virtual std::string GradOpType() const { return #grad_op_type; }          \
-  };                                                                          \
-  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,    \
-                    op_maker_class);
-
-#define REGISTER_OP_WITH_KERNEL(op_type, ...)                         \
-  REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
-                    ##__VA_ARGS__)
-
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
   REGISTER_OPERATOR(op_type, op_class, op_maker_class)
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 106b5f866e..67e02e2f11 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -44,6 +44,7 @@ class ParallelExecutorPrivate {
 #endif
 
   std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
+  bool own_local_scope;
 };
 
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -63,11 +64,13 @@ ParallelExecutor::ParallelExecutor(
   // Step 1. Bcast the params to devs.
   // Create local scopes
   if (local_scopes.empty()) {
+    member_->own_local_scope = true;
     member_->local_scopes_.emplace_back(member_->global_scope_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       member_->local_scopes_.emplace_back(&scope->NewScope());
     }
   } else {
+    member_->own_local_scope = false;
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
       member_->local_scopes_.emplace_back(local_scopes[i]);
@@ -231,5 +234,13 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
   }
 }
 
+ParallelExecutor::~ParallelExecutor() {
+  if (member_->own_local_scope) {
+    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
+      member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 303ac3bc55..f4f283bb4b 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -42,6 +42,8 @@ class ParallelExecutor {
                             const std::vector<Scope*>& local_scopes,
                             bool allow_op_delay);
 
+  ~ParallelExecutor();
+
   std::vector<Scope*>& GetLocalScopes();
 
   /**
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 7d6781c2c3..3c3fbfaae5 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -110,12 +110,12 @@ function(op_library TARGET)
     # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
     # And for detail pybind information, please see generated paddle/pybind/pybind.h.
     file(READ ${TARGET}.cc TARGET_CONTENT)
-    string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
-    string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+    string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
     if (one_register STREQUAL "")
         string(REPLACE "_op" "" TARGET "${TARGET}")
     else ()
-        string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+        string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
         string(REPLACE "," "" TARGET "${TARGET}")
     endif()
 
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index b261144f3d..56451f8f14 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -558,95 +558,126 @@ $$out = \frac{x}{1 + e^{- \beta x}}$$
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
-            ops::ActivationOpGrad);
+REGISTER_OPERATOR(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
-            logsigmoid_grad, ops::ActivationOpGrad);
+REGISTER_OPERATOR(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(logsigmoid_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
-            ops::ActivationOpGrad);
+REGISTER_OPERATOR(exp, ops::ActivationOp, ops::ExpOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(exp_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(relu_grad, ops::ActivationWithMKLDNNOpGrad);
+
+REGISTER_OPERATOR(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(tanh_grad, ops::ActivationWithMKLDNNOpGrad);
+
+REGISTER_OPERATOR(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(tanh_shrink_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(softshrink_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sqrt_grad, ops::ActivationWithMKLDNNOpGrad);
+
+REGISTER_OPERATOR(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(abs_grad, ops::ActivationWithMKLDNNOpGrad);
+
+REGISTER_OPERATOR(ceil, ops::ActivationOp, ops::CeilOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(ceil_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(floor, ops::ActivationOp, ops::FloorOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(floor_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(cos, ops::ActivationOp, ops::CosOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cos_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(sin, ops::ActivationOp, ops::SinOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sin_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(round, ops::ActivationOp, ops::RoundOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(round_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reciprocal_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(log, ops::ActivationOp, ops::LogOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(square, ops::ActivationOp, ops::SquareOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(square_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(softplus, ops::ActivationOp, ops::SoftplusOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(softplus_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(softsign, ops::ActivationOp, ops::SoftsignOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(softsign_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(brelu, ops::ActivationOp, ops::BReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(brelu_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(leaky_relu_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(soft_relu_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(elu, ops::ActivationOp, ops::ELUOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elu_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(relu6, ops::ActivationOp, ops::Relu6OpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(relu6_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(pow, ops::ActivationOp, ops::PowOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(pow_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(stanh, ops::ActivationOp, ops::STanhOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(stanh_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
-            ops::ActivationWithMKLDNNOpGrad);
+REGISTER_OPERATOR(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(hard_shrink_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
-            ops::ActivationWithMKLDNNOpGrad);
-
-REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
-            tanh_shrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
-            softshrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
-            ops::ActivationWithMKLDNNOpGrad);
-
-REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
-            ops::ActivationWithMKLDNNOpGrad);
-
-REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(cos, ops::ActivationOp, ops::CosOpMaker, cos_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(sin, ops::ActivationOp, ops::SinOpMaker, sin_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
-            reciprocal_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
-            leaky_relu_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
-            hard_shrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
-            thresholded_relu_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
-            hard_sigmoid_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
-            ops::ActivationOpGrad);
+REGISTER_OPERATOR(thresholded_relu, ops::ActivationOp,
+                  ops::ThresholdedReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(thresholded_relu_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(hard_sigmoid_grad, ops::ActivationOpGrad);
+
+REGISTER_OPERATOR(swish, ops::ActivationOp, ops::SwishOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(swish_grad, ops::ActivationOpGrad);
 
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)   \
   REGISTER_OP_CPU_KERNEL(                                                 \
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
index 2ec984d8e0..e910ad92d1 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -153,9 +153,11 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
-            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
-            ops::BilinearTensorProductOpGrad);
+REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
+                  ops::BilinearTensorProductOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(bilinear_tensor_product_grad,
+                  ops::BilinearTensorProductOpGrad);
 REGISTER_OP_CPU_KERNEL(
     bilinear_tensor_product,
     ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
index a3b67964c7..c71139fc7c 100644
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -81,8 +81,9 @@ class ClipOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
-            ops::ClipOpGrad);
+REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad);
 REGISTER_OP_CPU_KERNEL(
     clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 4a36b03cb6..3bb3bd4eb1 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -103,10 +103,12 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
-               ops::ConcatOpGrad, false)
+REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<
+                      false> /* set false to disable empty grad */);
+REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 695db841a4..92748993c3 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -335,14 +335,17 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::ConvOpGrad);
+REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 
 // depthwise convolution op
-REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv2d_grad, ops::ConvOpGrad);
-REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
-            ops::ConvOpGrad);
+REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
+REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
 
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
index a1a0b00208..82fdd30820 100644
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -193,8 +193,9 @@ class ConvShiftGradKernel<platform::CPUPlace, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
-            conv_shift_grad, ops::ConvShiftGradOp);
+REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp);
 REGISTER_OP_CPU_KERNEL(conv_shift,
                        ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 08f5939d42..d699dcafa4 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -298,8 +298,10 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
-            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
+                  ops::Conv2DTransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose,
@@ -311,8 +313,10 @@ REGISTER_OP_CPU_KERNEL(
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
                                      double>);
 
-REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
-            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
+REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
+                  ops::Conv3DTransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose,
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index 4c8af408f6..04ca878e68 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -153,8 +153,9 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
-            ops::CosSimOpGrad);
+REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad);
 REGISTER_OP_CPU_KERNEL(
     cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index fd7ea70c64..a8f1fbd529 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -153,7 +153,9 @@ class CropOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
+REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
 REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
 REGISTER_OP_CPU_KERNEL(
     crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 55810371c8..0e0622e290 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -164,8 +164,9 @@ or not. But the output only shares the LoD information with input X.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
-            cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
                        ops::CrossEntropyOpKernel<double>);
 REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 0da6f18852..f7c516a0ba 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -79,4 +79,4 @@ using CPU = paddle::platform::CPUDeviceContext;
 REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker);
 REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>)
+                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
index 70e2a1de5e..eb5fd99ccb 100644
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
@@ -19,4 +19,4 @@ using CUDA = paddle::platform::CUDADeviceContext;
 
 REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel<CUDA, ops::CumsumFunctor<float>>,
                         ops::CumKernel<CUDA, ops::CumsumFunctor<double>>,
-                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>)
+                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 16c612c45a..69fcffe9bc 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         platform::CPUPlace cpu;
         auto& gpu_dev_ctx =
             static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor.memory_size();
+        auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
         payload = memory::Alloc(cpu, copy_size);
 
         memory::Copy(cpu, payload,
@@ -99,7 +99,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       } else {
         payload = tensor.data<void>();
       }
-      payload_size = tensor.memory_size();
+      payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
       e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
     } break;
     case framework::proto::VarType_Type_SELECTED_ROWS: {
@@ -118,7 +118,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         platform::CPUPlace cpu;
         auto& gpu_dev_ctx =
             static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor->memory_size();
+        auto copy_size =
+            tensor->numel() * framework::SizeOfType(tensor->type());
         payload = memory::Alloc(cpu, copy_size);
         memory::Copy(cpu, payload,
                      boost::get<platform::CUDAPlace>(tensor->place()),
@@ -133,7 +134,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       } else {
         payload = slr->mutable_value()->data<void>();
       }
-      payload_size = tensor->memory_size();
+      payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
       e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
     } break;
     default:
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index e4436549f6..4ed1b54884 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -101,8 +101,9 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
-            ops::DropoutOpGrad);
+REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
index 6f9a090c8e..c7ddafcad1 100644
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -30,8 +30,10 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
-            elementwise_div_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
+                  ops::ElementwiseDivOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc
index 61da7c5944..a4fe386bb1 100644
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
@@ -29,8 +29,10 @@ class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker,
-            elementwise_max_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp,
+                  ops::ElementwiseMaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_max,
     ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc
index c74ff36db1..68cd6ddb4a 100644
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
@@ -29,8 +29,10 @@ class ElementwiseMinOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker,
-            elementwise_min_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp,
+                  ops::ElementwiseMinOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_min,
     ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
index 5d7f2cdffd..2dec27136a 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -31,8 +31,10 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
-            elementwise_mul_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
+                  ops::ElementwiseMulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index 6f770820c8..9d0598fc39 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -29,8 +29,10 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
-            elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_OPERATOR(elementwise_sub, ops::ElementwiseOp,
+                  ops::ElementwiseSubOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 51a66bd832..9c71ee6d3b 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_op.h"
+#include <vector>
+
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -128,8 +131,9 @@ class ExpandGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
-            ops::ExpandGradOp);
+REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
     expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
index 2c2d5c7c42..75dbf1d8bf 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -14,13 +14,14 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
+
 #include <boost/preprocessor/arithmetic/div.hpp>
 #include <boost/preprocessor/arithmetic/mod.hpp>
 #include <boost/preprocessor/comparison/greater.hpp>
 #include <boost/preprocessor/comparison/greater_equal.hpp>
 #include <boost/preprocessor/control/if.hpp>
 #include <boost/preprocessor/repetition/repeat.hpp>
-#include <iostream>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 381771f157..45e4d5b2b8 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -98,5 +98,6 @@ FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad,
-            paddle::operators::FCOpGrad);
+REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad);
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 6be06b8816..4c82f5c429 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -100,7 +100,8 @@ Out = [[3, 4],
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
-            ops::GatherGradOp);
+REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 3819549c71..7e014dd1cb 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gather.cu.h"
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "scatter.cu.h"
+#include "paddle/fluid/operators/scatter.cu.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index 5a8b1ebbe3..2dd726bebb 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "gather.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "scatter.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index 7625bd45d9..9c0561b016 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -12,38 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
 #include <gtest/gtest.h>
 #include <iostream>
 #include <string>
 
-TEST(Gather, GatherData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators;
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/platform/place.h"
 
-  Tensor* src = new Tensor();
-  Tensor* index = new Tensor();
-  Tensor* output = new Tensor();
+TEST(Gather, GatherData) {
+  paddle::framework::Tensor* src = new paddle::framework::Tensor();
+  paddle::framework::Tensor* index = new paddle::framework::Tensor();
+  paddle::framework::Tensor* output = new paddle::framework::Tensor();
 
   int* p_src = nullptr;
   int* p_index = nullptr;
-  p_src = src->mutable_data<int>(make_ddim({3, 4}), CPUPlace());
-  p_index = index->mutable_data<int>(make_ddim({2}), CPUPlace());
+  p_src = src->mutable_data<int>(paddle::framework::make_ddim({3, 4}),
+                                 paddle::platform::CPUPlace());
+  p_index = index->mutable_data<int>(paddle::framework::make_ddim({2}),
+                                     paddle::platform::CPUPlace());
 
   for (int i = 0; i < 12; ++i) p_src[i] = i;
   p_index[0] = 1;
   p_index[1] = 0;
 
-  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
+  int* p_output = output->mutable_data<int>(
+      paddle::framework::make_ddim({2, 4}), paddle::platform::CPUPlace());
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  CPUGather<int>(ctx, *src, *index, output);
+  paddle::operators::CPUGather<int>(ctx, *src, *index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index 9002ce4717..0d7219ac5c 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 2490b83b8c..0a524c914d 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -216,7 +216,9 @@ class GRUGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(gru_grad, ops::GRUGradOp);
 REGISTER_OP_CPU_KERNEL(
     gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
index f4c766db0a..80496e4c77 100644
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -198,8 +198,9 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
-            ops::GRUUnitGradOp);
+REGISTER_OPERATOR(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp);
 REGISTER_OP_CPU_KERNEL(
     gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
index efe84f1409..086b5a97de 100644
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -103,8 +103,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
-            hinge_loss_grad, ops::HingeLossGradOp);
+REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     hinge_loss,
     ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index 134b23b461..74d8e0e2b7 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -121,8 +121,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
-            huber_loss_grad, ops::HuberLossGradOp);
+REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     huber_loss,
     ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 5b387d8d34..8c120eec86 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -148,8 +148,9 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
-            im2sequence_grad, ops::Im2SequenceGradOp);
+REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
 REGISTER_OP_CPU_KERNEL(
     im2sequence,
     ops::Im2SequenceKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index ec2e641679..5d8710a9b3 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -89,4 +89,4 @@ REGISTER_OP_CPU_KERNEL(
     increment, ops::IncrementKernel<paddle::platform::CPUDeviceContext, float>,
     ops::IncrementKernel<paddle::platform::CPUDeviceContext, double>,
     ops::IncrementKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>)
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu
index 7fb6425fe9..228063bf3d 100644
--- a/paddle/fluid/operators/increment_op.cu
+++ b/paddle/fluid/operators/increment_op.cu
@@ -19,4 +19,4 @@ REGISTER_OP_CUDA_KERNEL(
     increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>,
     ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>,
     ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>)
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc
old mode 100755
new mode 100644
diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu
old mode 100755
new mode 100644
diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
index 963b0587c3..0c143b7c8a 100644
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
@@ -67,8 +67,9 @@ $$Out = \sum{|X|}$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
-            ops::L1NormGradOp);
+REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index c2a8c7f867..a73c626032 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -117,8 +117,9 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
-            label_smooth_grad, ops::LabelSmoothGradOp);
+REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp);
 REGISTER_OP_CPU_KERNEL(
     label_smooth,
     ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index 88b3b08af5..de1056aef7 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -162,8 +162,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-            layer_norm_grad, ops::LayerNormGradOp);
+REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
 REGISTER_OP_CPU_KERNEL(
     layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index ef568a578b..2f29e377fd 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -256,8 +256,10 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker,
-            linear_chain_crf_grad, ops::LinearChainCRFGradOp);
+REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
+                  ops::LinearChainCRFOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp);
 REGISTER_OP_CPU_KERNEL(
     linear_chain_crf,
     ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 7d5687f2d0..92ebfc274b 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -155,8 +155,9 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
-            ops::LoDResetGradOp);
+REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp);
 REGISTER_OP_CPU_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
     ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index f44996d8ac..a8258a1afd 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -106,8 +106,9 @@ class LogLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
-            ops::LogLossGradOp);
+REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 553a06c3dc..f5c0e47fda 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -276,7 +276,9 @@ class LRNOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
+REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad);
 REGISTER_OP_CPU_KERNEL(
     lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index e062d62c66..084ee1cfe6 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -273,7 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
+REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
index b3c9d7c34d..e1157ef6c6 100644
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -97,8 +97,9 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
-            ops::LstmUnitGradOp);
+REGISTER_OPERATOR(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp);
 REGISTER_OP_CPU_KERNEL(lstm_unit,
                        ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
                        ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index 82541517e1..f9261323f0 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -322,8 +322,9 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
-            ops::LSTMPGradOp);
+REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index b146b50883..0b41a3e1ff 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -111,9 +111,10 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
-            ops::MarginRankLossOpMaker<float>, margin_rank_loss_grad,
-            ops::MarginRankLossGradOp);
+REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
+                  ops::MarginRankLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     margin_rank_loss,
     ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 1f52558873..e5d33fbc36 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -237,8 +237,9 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
-            ops::MatMulOpGrad);
+REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad);
 REGISTER_OP_CPU_KERNEL(
     matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index 4e28d98834..e2bcba5a5e 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -101,8 +101,9 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
-            ops::MaxOutOpGrad);
+REGISTER_OPERATOR(maxout, ops::MaxOutOp, ops::MaxOutOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index a8fbd48c4d..3a0fc74584 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -108,9 +108,10 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
-            ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad,
-            ops::ModifiedHuberLossGradOp);
+REGISTER_OPERATOR(modified_huber_loss, ops::ModifiedHuberLossOp,
+                  ops::ModifiedHuberLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp);
 
 REGISTER_OP_CPU_KERNEL(
     modified_huber_loss,
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 5038287527..bfb20fefba 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -160,7 +160,9 @@ class MulGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
+REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc
index 0f80f752c9..a12b975326 100644
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -173,8 +173,8 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
   void MultiClassNMS(const framework::ExecutionContext& ctx,
                      const Tensor& scores, const Tensor& bboxes,
-                     std::map<int, std::vector<int>>& indices,
-                     int& num_nmsed_out) const {
+                     std::map<int, std::vector<int>>* indices,
+                     int* num_nmsed_out) const {
     int64_t background_label = ctx.Attr<int>("background_label");
     int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
     int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
@@ -189,15 +189,15 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       if (c == background_label) continue;
       Tensor score = scores.Slice(c, c + 1);
       NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
-              &(indices[c]));
-      num_det += indices[c].size();
+              &((*indices)[c]));
+      num_det += (*indices)[c].size();
     }
 
-    num_nmsed_out = num_det;
+    *num_nmsed_out = num_det;
     const T* scores_data = scores.data<T>();
     if (keep_top_k > -1 && num_det > keep_top_k) {
       std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-      for (const auto& it : indices) {
+      for (const auto& it : *indices) {
         int label = it.first;
         const T* sdata = scores_data + label * predict_dim;
         const std::vector<int>& label_indices = it.second;
@@ -220,13 +220,13 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         int idx = score_index_pairs[j].second.second;
         new_indices[label].push_back(idx);
       }
-      new_indices.swap(indices);
-      num_nmsed_out = keep_top_k;
+      new_indices.swap(*indices);
+      *num_nmsed_out = keep_top_k;
     }
   }
 
   void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
-                        std::map<int, std::vector<int>>& selected_indices,
+                        const std::map<int, std::vector<int>>& selected_indices,
                         Tensor* outs) const {
     int predict_dim = scores.dims()[1];
     auto* scores_data = scores.data<T>();
@@ -273,7 +273,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
       std::map<int, std::vector<int>> indices;
       int num_nmsed_out = 0;
-      MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out);
+      MultiClassNMS(ctx, ins_score, ins_boxes, &indices, &num_nmsed_out);
       all_indices.push_back(indices);
       batch_starts.push_back(batch_starts.back() + num_nmsed_out);
     }
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index ad623e1fe0..8de974bc2b 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -135,8 +135,9 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
       auto* x = ctx.Input<LoDTensor>("X");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
       PADDLE_ENFORCE(platform::dynload::ncclBcast(
-          (void*)x->data<T>(), x->numel(), NCCLTypeWrapper<T>::type, root,
-          comm->comms().at(idx), ctx.cuda_device_context().stream()));
+          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
+          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
+          ctx.cuda_device_context().stream()));
       VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
     } else {
       auto* out = ctx.Output<LoDTensor>("Out");
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 99f38529bb..192bdf8ea5 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/nce_op.h"
 
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
@@ -179,7 +181,9 @@ class NCEOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
+REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad);
 REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
                        ops::NCEKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(nce_grad,
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index 9420763847..2c4c97f28b 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <math.h>
 #include <random>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -108,7 +109,7 @@ class NCEKernel : public framework::OpKernel<T> {
     auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
     for (int64_t i = 0; i < sample_labels->numel(); ++i) {
       Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
-          (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
+          (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
            weight_mat.chip(sample_labels_data[i], 0))
               .sum();
       sample_out_data[i] += result(0);
@@ -190,7 +191,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
       for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         d_w_matrix.chip(sample_labels_data[i], 0) +=
-            x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
+            x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
             sample_grad_data[i];
       }
     }
@@ -202,7 +203,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       auto d_x_matrix = EigenMatrix<T>::From(*d_x);
       auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
       for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
+        d_x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) +=
             w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
       }
     }
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index 5345c5bdb0..30a991224f 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -85,8 +85,9 @@ class NormOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
-            ops::NormOpGrad);
+REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
 REGISTER_OP_CPU_KERNEL(
     norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index b144ec5f7d..f2de075e0d 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -333,18 +333,20 @@ Example:
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
-            ops::PoolOpGrad);
+REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
     ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
 
-REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
-            ops::PoolOpGrad);
+REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 4df0a14577..848cd61b23 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -258,9 +258,10 @@ Example:
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
-            ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
-            ops::MaxPoolWithIndexOpGrad);
+REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
+                  ops::MaxPool2dWithIndexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index,
@@ -272,11 +273,12 @@ REGISTER_OP_CPU_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>)
+                                    int>);
 
-REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
-            ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
-            ops::MaxPoolWithIndexOpGrad);
+REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
+                  ops::MaxPool3dWithIndexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index,
@@ -288,4 +290,4 @@ REGISTER_OP_CPU_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>)
+                                    int>);
diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc
index 5fc418b6fd..5497dcbd9c 100644
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cu.cc
@@ -27,7 +27,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>)
+                                    int>);
 
 REGISTER_OP_CUDA_KERNEL(
     max_pool3d_with_index,
@@ -40,4 +40,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>)
+                                    int>);
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 8eaa12a4a6..a066b3e06e 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -83,8 +83,9 @@ class PReluGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
-            ops::PReluGradOp);
+REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp);
 REGISTER_OP_CPU_KERNEL(
     prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index a1127f11a7..eb9ff8de3e 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -121,8 +121,9 @@ class RankLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad,
-            ops::RankLossGradOp);
+REGISTER_OPERATOR(rank_loss, ops::RankLossOp, ops::RankLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
index 7879367830..093db96647 100644
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/reduce_op.h"
 
+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
@@ -122,18 +125,18 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
  protected:
   std::string comment_;
 
-  void Replace(std::string &src, std::string from, std::string to) {
+  void Replace(std::string *src, std::string from, std::string to) {
     std::size_t len_from = std::strlen(from.c_str());
     std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
+    for (std::size_t pos = src->find(from); pos != std::string::npos;
+         pos = src->find(from, pos + len_to)) {
+      src->replace(pos, len_from, to);
     }
   }
 
   void SetComment(std::string name, std::string op) {
-    Replace(comment_, "{ReduceOp}", name);
-    Replace(comment_, "{reduce}", op);
+    Replace(&comment_, "{ReduceOp}", name);
+    Replace(&comment_, "{reduce}", op);
   }
 };
 
@@ -187,20 +190,25 @@ class ReduceProdOpMaker : public ReduceOpMaker {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
-            ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
-            reduce_mean_grad, ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
-            ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
-            ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp);
 
-REGISTER_OP(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
-            reduce_prod_grad, ops::ReduceGradOp);
+REGISTER_OPERATOR(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp);
 
 #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
   REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index b28dd7f209..e42b4bfe42 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -35,77 +35,77 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.sum(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->sum(dim);
   }
 };
 
 struct SumGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim);
+    dx->device(place) = dy->broadcast(dim);
   }
 };
 
 struct MeanFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.mean(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->mean(dim);
   }
 };
 
 struct MeanGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim) / dx.constant(size);
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
   }
 };
 
 struct MaxFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.maximum(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
   }
 };
 
 struct MinFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.minimum(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
   }
 };
 
 struct MaxOrMinGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    auto equals = x == y.broadcast(dim);
-    auto ones = dx.constant(1);
-    auto zeros = dx.constant(0);
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
     // If there are multiple minimum or maximum elements, the subgradient of
     // each is the set [0, 1], and we pass gradient to all of them here.
-    dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros);
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
   }
 };
 
 struct ProdFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.prod(dim);
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->prod(dim);
   }
 };
 
 struct ProdGradFunctor {
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim) * y.broadcast(dim) * x.inverse();
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
   }
 };
 
@@ -125,7 +125,7 @@ class ReduceKernel : public framework::OpKernel<T> {
           *context.template device_context<DeviceContext>().eigen_device();
       auto reduce_dim = Eigen::array<int, 1>({{0}});
       Functor functor;
-      functor(place, x, out, reduce_dim);
+      functor(place, &x, &out, reduce_dim);
     } else {
       int rank = context.Input<Tensor>("X")->dims().size();
       switch (rank) {
@@ -178,10 +178,10 @@ class ReduceKernel : public framework::OpKernel<T> {
 
     if (D == 1) {
       auto out = EigenScalar<T>::From(*output);
-      functor(place, x, out, reduce_dim);
+      functor(place, &x, &out, reduce_dim);
     } else {
       auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
-      functor(place, x, out, reduce_dim);
+      functor(place, &x, &out, reduce_dim);
     }
   }
 };
@@ -206,7 +206,7 @@ class ReduceGradKernel : public framework::OpKernel<T> {
       auto broadcast_dim =
           Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
       Functor functor;
-      functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+      functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
               broadcast_dim[0]);
     } else {
       int rank = context.Input<Tensor>("X")->dims().size();
@@ -258,7 +258,7 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
-    functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+    functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
             broadcast_dim[dim]);
   }
 };
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 93f9c74b80..5e5ccc3ded 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -113,8 +113,9 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 
-REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
-            ops::ReshapeGradOp);
+REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
 REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>,
                        ops::ReshapeKernel<CPU, double>,
                        ops::ReshapeKernel<CPU, int>,
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 6d4861f042..224ec93d28 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -153,8 +153,9 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
-            ops::ROIPoolGradOp);
+REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
     roi_pool,
     ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index d34beeb650..23f720da0b 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -250,8 +250,9 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad,
-            ops::RowConvGradOp);
+REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp);
 REGISTER_OP_CPU_KERNEL(
     row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 286f75df4c..2773c32a0a 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -23,17 +23,17 @@ USE_NO_KERNEL_OP(load_combine);
 
 int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
                             std::string var_name,
-                            paddle::platform::CPUPlace& place,
-                            paddle::framework::Scope& scope,
-                            paddle::framework::LoD& expect_lod) {
-  auto var = scope.Var(var_name);
+                            const paddle::platform::CPUPlace& place,
+                            paddle::framework::Scope* scope,
+                            paddle::framework::LoD* expect_lod) {
+  auto var = scope->Var(var_name);
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
   tensor->Resize({x, y});
-  expect_lod.resize(1);
+  expect_lod->resize(1);
   for (size_t i = 0; i < lod_info.size(); i++) {
-    expect_lod[0].push_back(lod_info[i]);
+    (*expect_lod)[0].push_back(lod_info[i]);
   }
-  tensor->set_lod(expect_lod);
+  tensor->set_lod(*expect_lod);
   int* expect = tensor->mutable_data<int>(place);
   for (int64_t i = 0; i < tensor->numel(); ++i) {
     expect[i] = static_cast<int>(i);
@@ -42,17 +42,17 @@ int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
 }
 
 paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
-    const std::string out_var_name, paddle::framework::Scope& scope) {
-  auto load_var = scope.Var(out_var_name);
+    const std::string out_var_name, paddle::framework::Scope* scope) {
+  auto load_var = scope->Var(out_var_name);
   auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
   return target;
 }
 
 int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
-                                 paddle::framework::Scope& scope,
-                                 paddle::framework::LoD& actual_lod) {
+                                 const paddle::framework::Scope& scope,
+                                 paddle::framework::LoD* actual_lod) {
   int* actual = target->data<int>();
-  actual_lod = target->lod();
+  *actual_lod = target->lod();
   return actual;
 }
 
@@ -78,26 +78,26 @@ TEST(SaveLoadCombineOp, CPU) {
   std::vector<int> lod1 = {0, 1, 2, 3, 10};
   int numel1 = 100;
   paddle::framework::LoD expect_lod1;
-  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
-                                        expect_lod1);
+  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place,
+                                        &scope, &expect_lod1);
 
   std::vector<int> lod2 = {0, 2, 5, 10};
   int numel2 = 200;
   paddle::framework::LoD expect_lod2;
-  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
-                                        expect_lod2);
+  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place,
+                                        &scope, &expect_lod2);
 
   std::vector<int> lod3 = {0, 2, 3, 20};
   int numel3 = 4000;
   paddle::framework::LoD expect_lod3;
   int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
-                                        scope, expect_lod3);
+                                        &scope, &expect_lod3);
 
   std::vector<int> lod4 = {0, 1, 20};
   int numel4 = 1000;
   paddle::framework::LoD expect_lod4;
-  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
-                                        expect_lod4);
+  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place,
+                                        &scope, &expect_lod4);
 
   // Set attributes
   std::string filename = "check_tensor.ls";
@@ -111,10 +111,10 @@ TEST(SaveLoadCombineOp, CPU) {
   save_combine_op->Run(scope, place);
 
   // Set up output vars
-  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
-  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
-  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
-  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope);
 
   // Run the load_combine_op
   auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
@@ -123,10 +123,10 @@ TEST(SaveLoadCombineOp, CPU) {
   load_combine_op->Run(scope, place);
 
   paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
-  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
-  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
-  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
-  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
+  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, &actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, &actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, &actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, &actual_lod4);
 
   CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
   CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index d6fd621471..95b12455ea 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -102,7 +102,8 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
-            ops::ScatterGradOp);
+REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp);
 REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
index 126753edd0..3c21903e3a 100644
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -124,9 +124,11 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
-               ops::SequenceConcatOpMaker, sequence_concat_grad,
-               ops::SequenceConcatGradOp, false);
+REGISTER_OPERATOR(sequence_concat, ops::SequenceConcatOp,
+                  ops::SequenceConcatOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<
+                      false> /* set false to disable empty grad */);
+REGISTER_OPERATOR(sequence_concat_grad, ops::SequenceConcatGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_concat,
     ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc
index ec1f3a5da8..94f4b49b00 100644
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_conv_op.h"
 
+#include <algorithm>
+
 namespace paddle {
 namespace operators {
 
@@ -174,8 +176,9 @@ context_length, context_stride and context_start.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
-            sequence_conv_grad, ops::SequenceConvGradOp);
+REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp);
 
 REGISTER_OP_CPU_KERNEL(
     sequence_conv,
diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index ae52849162..84a35d7172 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -200,8 +200,10 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
-            sequence_expand_grad, ops::SequenceExpandOpGrad);
+REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp,
+                  ops::SequenceExpandOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad);
 REGISTER_OP_CPU_KERNEL(
     sequence_expand,
     ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc
index d09e5bca56..7cd620af07 100644
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
@@ -120,8 +120,10 @@ NOTE: The first dimension size of input, the size of offset and Length, should b
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
-            sequence_slice_grad, ops::SequenceSliceGradOp);
+REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp,
+                  ops::SequenceSliceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_slice,
     ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu
old mode 100755
new mode 100644
diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
index 5661f4b42f..0ddacb5710 100644
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
@@ -99,7 +99,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
                    ops::SequenceSoftmaxCUDNNKernel<float>,
-                   ops::SequenceSoftmaxCUDNNKernel<double>)
+                   ops::SequenceSoftmaxCUDNNKernel<double>);
 REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
                    ops::SequenceSoftmaxGradCUDNNKernel<float>,
-                   ops::SequenceSoftmaxGradCUDNNKernel<double>)
+                   ops::SequenceSoftmaxGradCUDNNKernel<double>);
diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc
index d2c1317bef..a0d47c12ba 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@@ -155,9 +155,10 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
-            ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
-            ops::SequenceSoftmaxGradOp);
+REGISTER_OPERATOR(sequence_softmax, ops::SequenceSoftmaxOp,
+                  ops::SequenceSoftmaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_softmax,
     ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc
index 57adea3a1b..397df75415 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
@@ -18,7 +18,7 @@ namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sequence_softmax,
     ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>)
+    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     sequence_softmax_grad,
     ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 7b93f19bb2..5db77d0493 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -135,11 +135,12 @@ However the output only shares the LoD with input `X`.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sigmoid_cross_entropy_with_logits,
-            ops::SigmoidCrossEntropyWithLogitsOp,
-            ops::SigmoidCrossEntropyWithLogitsOpMaker,
-            sigmoid_cross_entropy_with_logits_grad,
-            ops::SigmoidCrossEntropyWithLogitsGradOp);
+REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits,
+                  ops::SigmoidCrossEntropyWithLogitsOp,
+                  ops::SigmoidCrossEntropyWithLogitsOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
+                  ops::SigmoidCrossEntropyWithLogitsGradOp);
 REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
                        ops::SigmoidCrossEntropyWithLogitsKernel<
                            paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index 658eb01952..322581fdef 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -132,8 +132,9 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
-            smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
+REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
     smooth_l1_loss,
     ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index e1f286f9ba..2741ba95bc 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -160,8 +160,9 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
-            ops::SoftmaxOpGrad);
+REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/split_byref_op.cc b/paddle/fluid/operators/split_byref_op.cc
new file mode 100644
index 0000000000..7413ce3e9c
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+#include "paddle/fluid/operators/split_op.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class SplitByrefOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SplitOp should not be null.");
+    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
+                      "Outputs(Out) of SplitOp should not be empty.");
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
+    std::vector<int> sections = static_cast<std::vector<int>>(
+        ctx->Attrs().Get<std::vector<int>>("sections"));
+    const size_t outs_number = outs_names.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(outs_number);
+
+    if (num > 0) {
+      int64_t in_axis_dim = in_dims[0];
+      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
+                        "tensor split does not result"
+                        " in an equal division");
+      size_t out_axis_dim = in_axis_dim / num;
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = out_axis_dim;
+        outs_dims.push_back(dim);
+      }
+    } else if (sections.size() > 0) {
+      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
+                        "tensor split sections size"
+                        "should be equal to output size.");
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = sections[i];
+        outs_dims.push_back(dim);
+      }
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+};
+
+class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitByrefOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of the split operator.");
+    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SplitByref operator
+
+Split source tensor to sevaral tensors by axis 0. No copy in this operator
+is performed, output tensor shares the same blocks of memory.
+)DOC");
+    AddAttr<std::vector<int>>("sections",
+                              "(vector<int>) "
+                              "the length of each output along the "
+                              "specified axis.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("num",
+                 "(int, default 0)"
+                 "Number of sub-tensors. This must evenly divide "
+                 "Input.dims()[axis]")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// NOTE: concat op default axis must be 0!
+USE_CPU_ONLY_OP(concat);
+
+REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker,
+                  ops::SplitGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split_byref, ops::SplitByrefOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_byref_op.cu.cc b/paddle/fluid/operators/split_byref_op.cu.cc
new file mode 100644
index 0000000000..5ee6186f35
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cu.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split_byref,
+    ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h
new file mode 100644
index 0000000000..a3aad68ea7
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SplitByrefOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto place = ctx.GetPlace();
+
+    size_t row_offset = 0;
+    for (size_t i = 0; i < outs.size(); ++i) {
+      // NOTE: no need to call mutable_data here to allocate memory.
+      auto* out = outs[i];
+      VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
+      *out = std::move(in->Slice(row_offset, row_offset + out->dims()[0]));
+      row_offset += out->dims()[0];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index e745509ec8..a4398df36b 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -108,21 +108,6 @@ Example:
   }
 };
 
-class SplitGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto op = new framework::OpDesc();
-    op->SetType("concat");
-    op->SetInput("X", OutputGrad("Out"));
-    op->SetOutput("Out", InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index e2c41f44ab..f0c417c705 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -44,5 +44,20 @@ class SplitOpKernel : public framework::OpKernel<T> {
   }
 };
 
+class SplitGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto op = new framework::OpDesc();
+    op->SetType("concat");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
index 8c55b4ebbc..1cada95501 100644
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -92,7 +92,9 @@ class SppOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+REGISTER_OPERATOR(spp, ops::SppOp, ops::SppOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(spp_grad, ops::SppOpGrad);
 REGISTER_OP_CPU_KERNEL(
     spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc
index 1c5e87040a..c32f575b54 100644
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
@@ -109,9 +109,10 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
-            ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad,
-            ops::SquaredL2DistanceGradOp);
+REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp,
+                  ops::SquaredL2DistanceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_distance,
     ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc
index b64df2a218..4ce51259da 100644
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
@@ -67,8 +67,10 @@ $$Out = \sum_{i} X_{i}^2$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
-            squared_l2_norm_grad, ops::SquaredL2NormGradOp);
+REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
+                  ops::SquaredL2NormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_norm,
     ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 4aea9cd65b..3555cb68ca 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -118,8 +118,9 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad,
-            ops::TransposeOpGrad);
+REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
 REGISTER_OP_CPU_KERNEL(
     transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 31859fd1d7..b3cd87efa2 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -132,8 +132,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
-            ops::UnpoolOpGrad);
+REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad);
 REGISTER_OP_CPU_KERNEL(
     unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
     ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 940bf4fe7b..6835a5dd62 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -132,8 +132,9 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
-            ops::WarpCTCGradOp);
+REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp);
 REGISTER_OP_CPU_KERNEL(
     warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 93533e5c9d..7de7f84a3d 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -127,6 +127,8 @@ void BindProgramDesc(pybind11::module *m) {
       .def("block", &pd::ProgramDesc::MutableBlock,
            pybind11::return_value_policy::reference)
       .def("num_blocks", &pd::ProgramDesc::Size)
+      .def("get_feed_target_names", &pd::ProgramDesc::GetFeedTargetNames)
+      .def("get_fetch_target_names", &pd::ProgramDesc::GetFetchTargetNames)
       .def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
       .def("parse_from_string",
            [](pd::ProgramDesc &program_desc, const std::string &data) {
@@ -299,6 +301,7 @@ void BindOpDesc(pybind11::module *m) {
       .def("check_attrs", &pd::OpDesc::CheckAttrs)
       .def("infer_shape", &pd::OpDesc::InferShape)
       .def("infer_var_type", &pd::OpDesc::InferVarType)
+      .def("set_is_target", &pd::OpDesc::SetIsTarget)
       .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
       .def("block", &pd::OpDesc::Block,
            pybind11::return_value_policy::reference);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 19bd30d966..64d92cac7e 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -294,7 +294,7 @@ All parameter, weight, gradient are variables in Paddle.
                     const std::vector<std::array<size_t, 2>> &targets) {
     ProgramDesc prog_with_targets(origin);
     for (const auto &t : targets) {
-      prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
+      prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true);
     }
     proto::ProgramDesc pruned_desc;
     Prune(*prog_with_targets.Proto(), &pruned_desc);
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index aa15392d7e..50be386c51 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -420,13 +420,14 @@ class DistributeTranspiler:
 
         # append op to the current block
         per_opt_block = append_block
-        for _, opt_op in enumerate(opt_op_on_pserver):
+        for idx, opt_op in enumerate(opt_op_on_pserver):
             for _, op in enumerate(self.optimize_ops):
                 # optimizer is connected to itself
                 if ufind.is_connected(op, opt_op) and \
                     op not in global_ops:
                     __append_optimize_op__(op, per_opt_block)
-            per_opt_block = pserver_program.create_block(append_block.idx)
+            if idx == len(opt_op_on_pserver) - 1 and global_ops:
+                per_opt_block = pserver_program.create_block(append_block.idx)
 
         # append global ops
         for glb_op in global_ops:
@@ -824,7 +825,7 @@ class DistributeTranspiler:
                 for v in splited_vars:
                     sections.append(v.shape[0])
                 program.global_block().append_op(
-                    type="split",
+                    type="split_byref",
                     inputs={"X": orig_var},
                     outputs={"Out": splited_vars},
                     attrs={"sections": sections}  # assume split evenly
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 4b841ef31d..5e6c6204c5 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1070,6 +1070,12 @@ class Program(object):
         for t in targets:
             if not isinstance(t, Operator):
                 if isinstance(t, Variable):
+                    if t.op is None:
+                        global_block = self.global_block()
+                        for op in global_block.ops:
+                            if t.name in op.output_arg_names:
+                                t.op = op
+                                break
                     t = t.op
                 else:
                     raise ValueError(("All targets of prune() can only be "
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 1c0f1f6eb4..bf4d81233d 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -340,6 +340,13 @@ def save_inference_model(dirname,
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
 
+    # Clear the is_target information and remove the existed feed and fetch op
+    global_block = main_program.global_block()
+    for i, op in enumerate(global_block.ops):
+        op.desc.set_is_target(False)
+        if op.type == "feed" or op.type == "fetch":
+            global_block.remove_op(i)
+
     pruned_program = main_program.prune(targets=target_vars)
     inference_program = pruned_program.inference_optimize()
     fetch_var_names = [v.name for v in target_vars]
@@ -362,24 +369,6 @@ def save_inference_model(dirname,
     save_persistables(executor, dirname, inference_program, params_filename)
 
 
-def get_feed_targets_names(program):
-    feed_targets_names = []
-    global_block = program.global_block()
-    for op in global_block.ops:
-        if op.desc.type() == 'feed':
-            feed_targets_names.insert(0, op.desc.output('Out')[0])
-    return feed_targets_names
-
-
-def get_fetch_targets_names(program):
-    fetch_targets_names = []
-    global_block = program.global_block()
-    for op in global_block.ops:
-        if op.desc.type() == 'fetch':
-            fetch_targets_names.append(op.desc.input('X')[0])
-    return fetch_targets_names
-
-
 def load_inference_model(dirname,
                          executor,
                          model_filename=None,
@@ -418,8 +407,8 @@ def load_inference_model(dirname,
     program = Program.parse_from_string(program_desc_str)
     load_persistables(executor, dirname, program, params_filename)
 
-    feed_target_names = get_feed_targets_names(program)
-    fetch_target_names = get_fetch_targets_names(program)
+    feed_target_names = program.desc.get_feed_target_names()
+    fetch_target_names = program.desc.get_fetch_target_names()
     fetch_targets = [
         program.global_block().var(name) for name in fetch_target_names
     ]
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 0027b651e8..d3c14b83fa 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -248,6 +248,10 @@ def infer(use_cuda, save_dirname=None):
 
         print("infer results: ", results[0])
 
+        fluid.io.save_inference_model(save_dirname, feed_target_names,
+                                      fetch_targets, exe,
+                                      inference_transpiler_program)
+
 
 def main(net_type, use_cuda, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index d5dd63e873..7703dfe013 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -97,15 +97,18 @@ class TestConv3dOp(OpTest):
         }
         self.outputs = {'Output': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place,
@@ -117,7 +120,7 @@ class TestConv3dOp(OpTest):
                 set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Input'],
@@ -132,7 +135,7 @@ class TestConv3dOp(OpTest):
                 no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Filter'],
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 328a9ffd25..f7e1e85732 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -109,8 +109,11 @@ class TestPool2d_Op(OpTest):
 
         self.outputs = {'Out': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
@@ -119,7 +122,7 @@ class TestPool2d_Op(OpTest):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        if self.use_cudnn and self.pool_type != "max":
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 15a8ac5e20..aaa9484251 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -118,15 +118,18 @@ class TestPool3d_Op(OpTest):
 
         self.outputs = {'Out': output.astype('float32')}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn and self.pool_type != "max":
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 887bdfe8b3..eb49a53e54 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -19,7 +19,7 @@ from op_test import OpTest
 
 class TestSplitOp(OpTest):
     def setUp(self):
-        self.op_type = "split"
+        self._set_op_type()
         axis = 1
         x = np.random.random((4, 5, 6)).astype('float32')
         out = np.split(x, [2, 3], axis)
@@ -28,6 +28,9 @@ class TestSplitOp(OpTest):
         self.outputs = {'Out': [('out%d' % i, out[i]) \
             for i in xrange(len(out))]}
 
+    def _set_op_type(self):
+        self.op_type = "split"
+
     def test_check_output(self):
         self.check_output()
 
@@ -35,5 +38,10 @@ class TestSplitOp(OpTest):
         self.check_grad(['X'], ['out0', 'out1', 'out2'])
 
 
+class TestSplitByrefOp(OpTest):
+    def _set_op_type(self):
+        self.op_type = "split_byref"
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 37c4296f9b..00c2a3b992 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -124,7 +124,7 @@ def test(word_idx):
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
 
 
-def word_dict():
+def word_dict(cutoff=150):
     """
     Build a word dictionary from the corpus.
 
@@ -132,7 +132,7 @@ def word_dict():
     :rtype: dict
     """
     return build_dict(
-        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), cutoff)
 
 
 def fetch():