diff --git a/doc/design/graph.md b/doc/design/graph.md
index 87f696f90f..51b7f87638 100644
--- a/doc/design/graph.md
+++ b/doc/design/graph.md
@@ -1,4 +1,4 @@
-# Design Doc: Computations as Graphs
+# Design Doc: Computations as a Graph
 
 A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
 
@@ -8,6 +8,8 @@ This document explains that the construction of a graph as three steps:
 - construct the backward part
 - construct the optimization part
 
+## The Construction of a Graph
+
 Let us take the problem of image classification as a simple example.  The application program that trains the model looks like:
 
 ```python
@@ -25,7 +27,9 @@ The first four lines of above program build the forward part of the graph.
 
 ![](images/graph_construction_example_forward_only.png)
 
-In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b.
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
+
+Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so to run at most once.  By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
 
 In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`.  These protobuf messages are saved in a `BlockDesc` protobuf message.
 
@@ -49,3 +53,18 @@ According to the chain rule of gradient computation, `ConstructBackwardGraph` wo
 For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient.  Here results in the complete graph:
 
 ![](images/graph_construction_example_all.png)
+
+## Block and Graph
+
+The word block and graph are interchangable in the desgin of PaddlePaddle.  A [Block[(https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions.  A graph of operators and variables is a representation of the block.
+
+A Block keeps operators in an array `BlockDesc::ops`
+
+```protobuf
+message BlockDesc {
+  repeated OpDesc ops = 1;
+  repeated VarDesc vars = 2;
+}
+```
+
+in the order that there appear in user programs, like the Python program at the beginning of this article.  We can imagine that in `ops`,  we have some forward operators, followed by some gradient operators, and then some optimization operators.
diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot
index bedb6de011..8d1b673abf 100644
--- a/doc/design/images/graph_construction_example.dot
+++ b/doc/design/images/graph_construction_example.dot
@@ -2,6 +2,8 @@ digraph ImageClassificationGraph {
         ///////// The forward part /////////
         FeedX [label="Feed", color=blue, shape=box];
         FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
         FC [label="FC", color=blue, shape=box];
         MSE [label="MSE", color=blue, shape=box];
 
@@ -14,6 +16,8 @@ digraph ImageClassificationGraph {
 
         FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
         FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
         W -> FC [color=blue];
         b -> FC [color=blue];
         l -> MSE [color=blue];
diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png
index 18d8330b60..1811875034 100644
Binary files a/doc/design/images/graph_construction_example_all.png and b/doc/design/images/graph_construction_example_all.png differ
diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png
index 61c3a02a04..3049a9315f 100644
Binary files a/doc/design/images/graph_construction_example_forward_backward.png and b/doc/design/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png
index 14805df11f..25d19088cb 100644
Binary files a/doc/design/images/graph_construction_example_forward_only.png and b/doc/design/images/graph_construction_example_forward_only.png differ
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index e3bee32f8e..58665e9f2b 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -23,17 +23,20 @@
 - `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
 - `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
 
-依据是否包含kernel，将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorBase`，后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
+依据是否包含kernel，可以将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorBase`，后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
 
-  
- 内容            | 定义位置         
---------------  | :----------------------  
+
+ 内容            | 定义位置
+--------------  | :----------------------
 OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
-Op定义           | `.cc`文件 
-Kernel实现       | CPU、GPU共享Kernel在`.h`文件，否则，CPU可以在`.cc`文件，GPU可在`.cu`文件。 
-注册Op           | Op注册在`.cc`文件；Kernel注册CPU在`.cc`文件，GPU在`.cu`文件
-     
-     
+Op定义           | `.cc`文件
+Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，GPU 实现在`.cu`文件中。
+注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，GPU实现在`.cu`文件中
+
+
+实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。
+
+
 下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
 
 
@@ -42,9 +45,11 @@ Kernel实现       | CPU、GPU共享Kernel在`.h`文件，否则，CPU可以在`
 
 ### 1. 定义ProtoMaker类
 
-矩阵乘的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释：
-    
-```
+矩阵乘法的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。
+
+首先定义`ProtoMaker`来描述该Op的输入、输出，并添加注释：
+
+```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -59,20 +64,20 @@ The equation is: Out = X * Y
   }
 };
 ```
-   
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数包括2个：
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
 
    - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
    - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
-   
-构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加该Op的注释，这些函数会将对应内容添加到`OpProto`中。
 
-在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，该命名尽可能的规范。
+构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
 
-   
-再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子：
-   
-```
+上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守命名规范。
+
+
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例：
+
+```cpp
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -87,17 +92,19 @@ The equation is: Out = scale*X
   }
 };
 ```
- 
- 在这个例子里，两处不同：
- 
-  - `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。
-  - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
-   
+
+这个例子有两处不同：
+
+- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中，如果Op的某个输入不参与反向梯度的计算，请显示地调用`.NotInGradient()`进行设置。
+
+- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+
 
 ### 2. 定义Operator类
 
+下面的点实现了MulOp的定义：
 
-```c++
+```cpp
 class MulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -121,33 +128,46 @@ class MulOp : public framework::OperatorWithKernel {
 ```
 
 [`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
-	 
-```c++
+
+```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
 ```
 
 这句表示使用基类`OperatorWithKernel`的构造函数，也可写成：
-   
-```c++
+
+```cpp
 MulOp(const std::string &type, const framework::VariableNameMap &inputs,
       const framework::VariableNameMap &outputs,
       const framework::AttributeMap &attrs)
   : OperatorWithKernel(type, inputs, outputs, attrs) {}
-```	
-	
+```
+
 还需要重写`InferShape`接口。`InferShape`为const函数，不能修改Op的成员变量，参数为`const framework::InferShapeContext &ctx`，通过该参数可获取到输入输出以及属性。它的功能是：
 
   - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法。
   - 2). 设置输出Tensor的形状。
 
-通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和要讲到的注册函数一起放在`.cc`中
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和下面将要介绍的注册函数一起放在`.cc`中
 
 ### 3. 定义OpKernel类
 
-```C++
-template <typename Place, typename T>
-class MulKernel : public framework::OpKernel {
- public:
+`MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
+
+- `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+
+- `typename T` : 表示数据类型，如`float`, `double`等。
+
+需要为`MulKernel`类重写`Compute`接口。
+- `Compute`接受一个输入参数：`const framework::ExecutionContext& context`。
+- 与`InferShapeContext`相比，`ExecutionContext`增加了设备类型，同样可获取到输入输出和属性参数。
+- `Compute`函数里实现`OpKernel`的具体计算逻辑。
+
+下面是 `MulKernel` `Compute`的实现：
+
+  ```cpp
+  template <typename Place, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<Tensor>("X");
     auto* Y = context.Input<Tensor>("Y");
@@ -157,141 +177,136 @@ class MulKernel : public framework::OpKernel {
         const_cast<platform::DeviceContext*>(context.device_context_);
     math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
   }
-};
-```
+  };
+  ```
 
-`MulKernel`继承自`framework::OpKernel`，带有模板参数:
+需要注意：**不同设备(CPU、GPU)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
 
-  - `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
-  
- - `typename T` : 表示数据类型，如`float`, `double`等。
-   
-`MulKernel`需要重写`Compute`接口，该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型，同样可获取到输入输出和属性参数，`Compute`函数里写具体实现时。
-   
-注意，不同设备(CPU、GPU)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel`，`OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 
+`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、GPU的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+
+
+到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
+反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
 
-为了使得`OpKernel`的计算过程书写较为简单，CPU、GPU的代码可以复用，我们通常借助Eigen unsupported Tensor模块来实现。关于在paddle中如何使用Eigen库，请参考对应的使用[文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)
-   
-到此前向Op实现完成，需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似，这里不再重复。但注意，反向Op没有`ProtoMaker`。
-   
 ### 4. 注册Operator
 
-在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+- 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+    ```
+
+   在上面的代码中：
+
+    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
+    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
 
-```c++
-namespace ops = paddle::operators;
-REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
-REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mul_grad,
-              ops::MulGradKernel<paddle::platform::CPUPlace, float>);
-```
-    
-  - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`，
-  - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
-  - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
-
-在 `.cu`文件中注册GPU Kernel。请注意，如果GPU Kernel的实现是基于Eigen unsupported模块，那么在 `.cu`的最前面请加上宏定义 `#define EIGEN_USE_GPU`
-   
-```c++
-// if use Eigen unsupported module before include head files
-#define EIGEN_USE_GPU
-
-namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(mul_grad,
-                       ops::MulGradKernel<paddle::platform::GPUPlace, float>);
-```
+
+- 在 `.cu`文件中注册GPU Kernel。
+    - 请注意，如果GPU Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
+
+    ```cpp
+    // if use Eigen unsupported module before include head files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_GPU_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    ```
 
 ### 5. 编译
 
-在[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。
-   
-```
-op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
-```
-   
-下面命令可以编译：
-   
-```
-make mul_op
-```
+- 简单**无特殊依赖**的OP无需修改CMakeList.txt文件。[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) 会自动将 `paddle/operators` 目录下新增的 `*_op.cc` 文件加入编译。
+- 较为复杂、**有额外依赖** 的operator仍需要修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)。如，`mul_op` 依赖 `math_function`，需要在`CMakeLists.txt`中添加如下内容：
+
+    ```
+    op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)		 +
+    ```
+
+- 运行下面命令可以进行编译：
+
+    ```
+    make mul_op
+    ```
 
 ## 绑定Python
 
-- 绑定Python 
- 
-    在 [`paddle/pybind/pybind.cc 
-`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类：
+- 绑定Python
+
+    在 [`paddle/pybind/pybind.cc
+`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) 使用`USE_OP`告知编译器需要链接的Op，具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
 
     ```
     USE_OP(mul);
     ```
     如果只实现了CPU版本，则使用`USE_CPU_ONLY_OP`:
-    
+
     ```
     USE_CPU_ONLY_OP(gather);
     ```
-    
+
     如果OP不带Kernel，则使用`USE_NO_KENREL_OP`:
-    
+
     ```
     USE_NO_KENREL_OP(recurrent);
     ```
-    
-    使用`USE_OP`告知编译器需要链接该Op的目标文件，具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
-    
-    
+
+
  - 生成库
 
-   在 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中，使得该Op可以链接到生成的lib库中。
-   
-   ```
-   if(WITH_PYTHON)
-     cc_library(paddle_pybind SHARED
-     SRCS pybind.cc
-     DEPS pybind python backward
-     mul_op
-     minus_op)
-   endif(WITH_PYTHON)
-   ```
+   无需修改 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件，`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。
 
 ## 实现单元测试
 
-单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单测](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
 
-### 前向Operator单测
+### 前向Operator单元测试
 
-前向Op单测继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`，具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数，以及Python对比的输出值。
+前向Op单元测试继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator，需要：
 
-```
-import unittest
-import numpy as np
-from gradient_checker import GradientChecker, create_op
-from op_test_util import OpTestMeta
+1. 在`setUp`函数定义输入、输出，以及相关的属性参数。
+2. 生成随机的输入数据。
+3. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比。
 
-class TestMulOp(unittest.TestCase):
-    __metaclass__ = OpTestMeta
 
-    def setUp(self):
-        self.type = "mul"
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
-        }
-        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-```
-   首先需要`import`必要的包,下面详细解释其他值：
-   
-   - `self.type = "mul" ` : 定义类型，和注册的类型一致。
-   - `self.inputs` : 定义输入，类型为Numpy.array，并初始化。
-   - `self.outputs` : 定义输出，并得到Python结算结果。
+  ```python
+  import unittest
+  import numpy as np
+  from gradient_checker import GradientChecker, create_op
+  from op_test_util import OpTestMeta
 
- 
-### 反向Operator单测
+  class TestMulOp(unittest.TestCase):
+      __metaclass__ = OpTestMeta
 
-反向Op单测继承自`GradientChecker`，而`GradientChecker`集成自`unittest.TestCase`，所以反向单测函数需要`test_`开头。
+      def setUp(self):
+          self.type = "mul"
+          self.inputs = {
+              'X': np.random.random((32, 84)).astype("float32"),
+              'Y': np.random.random((84, 100)).astype("float32")
+          }
+          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+  ```
 
-```
+上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
+
+- `self.type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
+- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
+- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
+
+
+### 反向Operator单元测试
+
+反向Op单元测试继承自`GradientChecker`，而`GradientChecker`继承自`unittest.TestCase`，因此，**反向单元测试函数需要以`test_`开头**。
+
+```python
 class TestMulGradOp(GradientChecker):
     def setUp(self):
         self.op = create_op("mul")
@@ -325,33 +340,34 @@ class TestMulGradOp(GradientChecker):
             no_grad_set={"Y"})
 ```
 
-下面解释一些关键的地方:
+下面解释代码中一些关键的地方:
 
-   - 调用`create_op("mul")`创建反向Op对应的前向Op。
-   - 调用`compare_grad`函数对比CPU、GPU计算结果。
-   - `test_normal`中调用`check_grad`检查梯度稳定性，这里采用数值法检测梯度正确性。
-      - 第一个参数`self.op` : 前向Op。
-      - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
-      - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
-      - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
-   - `test_ignore_x`和`test_ignore_y`分支测试只需要计算一个输入梯度的情况。
+- 调用`create_op("mul")`创建反向Op对应的前向Op。
+- 调用`compare_grad`函数对比CPU、GPU计算结果。
+- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`self.op` : 前向Op。
+  - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+  - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+  - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
+- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
 
 
-### 编译和执行 
+### 编译和执行单元测试
 
-单测完成之后，在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译：
+单元测试编写完成之后，在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)中添加以下内容，将单元测试加入工程：
 
 ```
 py_test(test_mul_op SRCS test_mul_op.py)
 ```
 
-编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`，编译成功之后执行单测命令为：
+请注意，**不同于Op的编译测试，运行单元测试测时需要编译整个工程**，并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后，执行下面的命令来运行单元测试：
 
-```
+```bash
 make test ARGS="-R test_mul_op -V"
 ```
+
 或者:
 
-```
+```bash
 ctest -R test_mul_op
 ```
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index cfd3e8dfde..85b7de7974 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -21,16 +21,16 @@ namespace framework {
 /// @cond HIDDEN
 
 template <int i>
-Dim<i> make_dim(const int* d) {
+Dim<i> make_dim(const int64_t* d) {
   return Dim<i>(*d, make_dim<i - 1>(d + 1));
 }
 
 template <>
-Dim<1> make_dim<1>(const int* d) {
+Dim<1> make_dim<1>(const int64_t* d) {
   return Dim<1>(*d);
 }
 
-void make_ddim(DDim& ddim, const int* dims, int n) {
+void make_ddim(DDim& ddim, const int64_t* dims, int n) {
   switch (n) {
     case 1:
       ddim = make_dim<1>(dims);
@@ -67,13 +67,13 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
 
 /// @endcond
 
-DDim make_ddim(std::initializer_list<int> dims) {
+DDim make_ddim(std::initializer_list<int64_t> dims) {
   DDim result(make_dim(0));
   make_ddim(result, dims.begin(), dims.size());
   return result;
 }
 
-DDim make_ddim(const std::vector<int>& dims) {
+DDim make_ddim(const std::vector<int64_t>& dims) {
   DDim result(make_dim(0));
   make_ddim(result, &dims[0], dims.size());
   return result;
@@ -81,12 +81,12 @@ DDim make_ddim(const std::vector<int>& dims) {
 
 /// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
-class DynamicMutableIndexer : public boost::static_visitor<int&> {
+class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
  public:
   explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
 
   template <int D>
-  int& operator()(Dim<D>& dim) const {
+  int64_t& operator()(Dim<D>& dim) const {
     return dim[idx_];
   }
 
@@ -94,12 +94,12 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
   int idx_;
 };
 
-class DynamicConstIndexer : public boost::static_visitor<int> {
+class DynamicConstIndexer : public boost::static_visitor<int64_t> {
  public:
   explicit DynamicConstIndexer(int idx) : idx_(idx) {}
 
   template <int D>
-  int operator()(const Dim<D>& dim) const {
+  int64_t operator()(const Dim<D>& dim) const {
     return dim[idx_];
   }
 
@@ -109,22 +109,22 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
 
 /// @endcond
 
-int& DDim::operator[](int idx) {
+int64_t& DDim::operator[](int idx) {
   return boost::apply_visitor(DynamicMutableIndexer(idx), var);
 }
 
-int DDim::operator[](int idx) const {
+int64_t DDim::operator[](int idx) const {
   return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
 
-ssize_t DDim::size() const { return arity(*this); }
+int64_t DDim::size() const { return arity(*this); }
 
 bool DDim::operator==(DDim d) const {
   if (var.which() != d.getVar().which()) {
     return false;
   } else {
-    std::vector<int> v1 = vectorize(*this);
-    std::vector<int> v2 = vectorize(d);
+    std::vector<int64_t> v1 = vectorize(*this);
+    std::vector<int64_t> v2 = vectorize(d);
 
     for (unsigned int i = 0; i < v1.size(); i++) {
       if (v1[i] != v2[i]) {
@@ -139,10 +139,10 @@ bool DDim::operator==(DDim d) const {
 bool DDim::operator!=(DDim d) const { return !(*this == d); }
 
 DDim DDim::operator+(DDim d) const {
-  std::vector<int> v1 = vectorize(*this);
-  std::vector<int> v2 = vectorize(d);
+  std::vector<int64_t> v1 = vectorize(*this);
+  std::vector<int64_t> v2 = vectorize(d);
 
-  std::vector<int> v3;
+  std::vector<int64_t> v3;
 
   assert(v1.size() == v2.size());
 
@@ -154,10 +154,10 @@ DDim DDim::operator+(DDim d) const {
 }
 
 DDim DDim::operator*(DDim d) const {
-  std::vector<int> v1 = vectorize(*this);
-  std::vector<int> v2 = vectorize(d);
+  std::vector<int64_t> v1 = vectorize(*this);
+  std::vector<int64_t> v2 = vectorize(d);
 
-  std::vector<int> v3;
+  std::vector<int64_t> v3;
 
   assert(v1.size() == v2.size());
 
@@ -168,15 +168,15 @@ DDim DDim::operator*(DDim d) const {
   return make_ddim(v3);
 }
 
-int get(const DDim& ddim, int idx) { return ddim[idx]; }
+int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
 
 void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
 
 /// @cond HIDDEN
 struct VectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int>& vector;
+  std::vector<int64_t>& vector;
 
-  explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+  explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
 
   template <typename T>
   void operator()(const T& t) {
@@ -188,31 +188,31 @@ struct VectorizeVisitor : public boost::static_visitor<> {
 };
 /// @endcond
 
-std::vector<int> vectorize(const DDim& ddim) {
-  std::vector<int> result;
+std::vector<int64_t> vectorize(const DDim& ddim) {
+  std::vector<int64_t> result;
   VectorizeVisitor visitor(result);
   boost::apply_visitor(visitor, ddim);
   return result;
 }
 
-struct ProductVisitor : public boost::static_visitor<ssize_t> {
+struct ProductVisitor : public boost::static_visitor<int64_t> {
   template <int D>
-  ssize_t operator()(const Dim<D>& dim) {
+  int64_t operator()(const Dim<D>& dim) {
     return product(dim);
   }
 };
 
-ssize_t product(const DDim& ddim) {
+int64_t product(const DDim& ddim) {
   ProductVisitor visitor;
   return boost::apply_visitor(visitor, ddim);
 }
 
 struct SliceVectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int>& vector;
+  std::vector<int64_t>& vector;
   int begin;
   int end;
 
-  SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
+  SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
       : vector(v), begin(b), end(e) {
     PADDLE_ENFORCE(begin < end,
                    "Begin index must be less than end index in ddim slice.");
@@ -240,7 +240,7 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
 };
 
 DDim slice_ddim(const DDim& dim, int begin, int end) {
-  std::vector<int> vec;
+  std::vector<int64_t> vec;
   vec.reserve(end - begin);
   SliceVectorizeVisitor visitor(vec, begin, end);
   boost::apply_visitor(visitor, dim);
@@ -280,7 +280,7 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
   return os;
 }
 
-DDim::DDim(std::initializer_list<int> init_list) {
+DDim::DDim(std::initializer_list<int64_t> init_list) {
   *this = make_ddim(init_list);
 }
 }  // namespace framework
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 95f294b627..db30c52394 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -40,7 +40,7 @@ struct DDim {
   template <int D>
   explicit DDim(const Dim<D>& in) : var(in) {}
 
-  /*implicit*/ DDim(std::initializer_list<int> init_list);
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
 
   template <int D>
   DDim& operator=(const Dim<D>& in) {
@@ -48,8 +48,8 @@ struct DDim {
     return *this;
   }
 
-  int& operator[](int idx);
-  int operator[](int idx) const;
+  int64_t& operator[](int idx);
+  int64_t operator[](int idx) const;
 
   template <typename Visitor>
   typename Visitor::result_type apply_visitor(Visitor& visitor) {
@@ -71,15 +71,15 @@ struct DDim {
 
   DDim operator*(DDim d) const;
 
-  ssize_t size() const;
+  int64_t size() const;
 };
 
 /**
- * \brief Make a DDim from std::vector<int>
+ * \brief Make a DDim from std::vector<int64_t>
  *
  * \param dims An vector of ints. Must be sized between [1, 9]
  */
-DDim make_ddim(const std::vector<int>& dims);
+DDim make_ddim(const std::vector<int64_t>& dims);
 
 /**
  * \brief Make a DDim from an initializer list
@@ -87,14 +87,14 @@ DDim make_ddim(const std::vector<int>& dims);
  * \param dims An initializer list of ints. Must be sized between [1, 9]
  *
  */
-DDim make_ddim(std::initializer_list<int> dims);
+DDim make_ddim(std::initializer_list<int64_t> dims);
 
-int get(const DDim& dim, int idx);
+int64_t get(const DDim& dim, int idx);
 void set(DDim& dim, int idx, int val);
 
-std::vector<int> vectorize(const DDim& ddim);
+std::vector<int64_t> vectorize(const DDim& ddim);
 
-ssize_t product(const DDim& ddim);
+int64_t product(const DDim& ddim);
 
 /**
  * \brief Slice a ddim
diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc
index 9d18a2972c..756232b1b5 100644
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -12,7 +12,7 @@ TEST(DDim, Equality) {
   EXPECT_EQ(ddim[2], 5);
 
   // construct a DDim from a vector
-  std::vector<int> vec({9, 1, 5});
+  std::vector<int64_t> vec({9, 1, 5});
   paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
   EXPECT_EQ(ddim[0], 9);
   EXPECT_EQ(ddim[1], 1);
@@ -25,7 +25,7 @@ TEST(DDim, Equality) {
   EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
 
   // vectorize a DDim
-  std::vector<int> res_vec = paddle::framework::vectorize(vddim);
+  std::vector<int64_t> res_vec = paddle::framework::vectorize(vddim);
   EXPECT_EQ(res_vec[0], 9);
   EXPECT_EQ(res_vec[1], 1);
   EXPECT_EQ(res_vec[2], 5);
diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h
index 883fdc55eb..04d4b0e604 100644
--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
@@ -17,13 +17,13 @@ struct Dim {
   static constexpr int dimensions = i;
 
   template <typename... Args>
-  HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) {
+  HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
     static_assert(sizeof...(_tail) == i - 1,
                   "Dim initialized with the wrong number of parameters");
   }
 
   HOSTDEVICE
-  Dim(int _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
+  Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
 
   HOSTDEVICE
   Dim() : head(0), tail() {}
@@ -31,12 +31,12 @@ struct Dim {
   /** Construct a Dim from a linear index and size.  Uses Fortran order
    * indexing. */
   HOSTDEVICE
-  Dim(int idx, const Dim<i>& size)
+  Dim(int64_t idx, const Dim<i>& size)
       : head(idx % size.head), tail(idx / size.head, size.tail) {}
 
   /** Construct a Dim with each dimension set to the given index */
   HOSTDEVICE
-  Dim(int idx) : head(idx), tail(idx) {}
+  Dim(int64_t idx) : head(idx), tail(idx) {}
 
   HOSTDEVICE
   bool operator==(const Dim<i>& o) const {
@@ -47,13 +47,13 @@ struct Dim {
   bool operator!=(const Dim<i>& o) const { return !(*this == o); }
 
   HOSTDEVICE
-  int& operator[](int idx);
+  int64_t& operator[](int idx);
   HOSTDEVICE
-  int operator[](int idx) const;
+  int64_t operator[](int idx) const;
 
   HOST std::string to_string() const;
 
-  int head;
+  int64_t head;
   Dim<i - 1> tail;
 };
 
@@ -63,7 +63,7 @@ struct Dim<1> {
   static constexpr int dimensions = 1;
 
   HOSTDEVICE
-  Dim(int _head) : head(_head) {}
+  Dim(int64_t _head) : head(_head) {}
 
   HOSTDEVICE
   Dim() : head(0) {}
@@ -86,11 +86,11 @@ struct Dim<1> {
   bool operator!=(const Dim<1>& o) const { return !(*this == o); }
 
   HOSTDEVICE
-  int& operator[](int idx);
+  int64_t& operator[](int idx);
   HOSTDEVICE
-  int operator[](int idx) const;
+  int64_t operator[](int idx) const;
 
-  int head;
+  int64_t head;
 };
 
 namespace {
@@ -100,12 +100,12 @@ template <int i>
 struct DimGetter {
   // Return a copy if Dim is const
   template <typename D>
-  HOSTDEVICE static int impl(const D& d) {
+  HOSTDEVICE static int64_t impl(const D& d) {
     return DimGetter<i - 1>::impl(d.tail);
   }
   // Return a reference if Dim is mutable
   template <typename D>
-  HOSTDEVICE static int& impl(D& d) {
+  HOSTDEVICE static int64_t& impl(D& d) {
     return DimGetter<i - 1>::impl(d.tail);
   }
 };
@@ -115,18 +115,18 @@ template <>
 struct DimGetter<0> {
   // Return a copy if Dim is const
   template <typename D>
-  HOSTDEVICE static int impl(const D& d) {
+  HOSTDEVICE static int64_t impl(const D& d) {
     return d.head;
   }
   // Return a reference if Dim is mutable
   template <typename D>
-  HOSTDEVICE static int& impl(D& d) {
+  HOSTDEVICE static int64_t& impl(D& d) {
     return d.head;
   }
 };
 
 template <int D>
-HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
+HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx < 0) {
     throw std::invalid_argument("Tried to access a negative dimension");
@@ -141,7 +141,7 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx != 0) {
     throw std::invalid_argument("Invalid index");
@@ -153,7 +153,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
 }
 
 template <int D>
-HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
+HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx < 0) {
     throw std::invalid_argument("Tried to access a negative dimension");
@@ -168,7 +168,7 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx != 0) {
     throw std::invalid_argument("Invalid index");
@@ -182,73 +182,76 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
 }  // namespace
 // Static access to constant Dim
 template <int i, int l>
-HOSTDEVICE int get(const Dim<l>& d) {
+HOSTDEVICE int64_t get(const Dim<l>& d) {
   return DimGetter<i>::impl(d);
 }
 
 // Static access to mutable Dim
 template <int i, int l>
-HOSTDEVICE int& get(Dim<l>& d) {
+HOSTDEVICE int64_t& get(Dim<l>& d) {
   return DimGetter<i>::impl(d);
 }
 
 // Dynamic access to constant Dim
 template <int l>
-HOSTDEVICE int Dim<l>::operator[](int i) const {
+HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
   return indexer(*this, i);
 }
 
 // Dynamic access to mutable Dim
 template <int l>
-HOSTDEVICE int& Dim<l>::operator[](int i) {
+HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
   return indexer(*this, i);
 }
 
 // Dynamic access to constant Dim
-inline HOSTDEVICE int Dim<1>::operator[](int i) const {
+inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
   return indexer(*this, i);
 }
 
 // Dynamic access to mutable Dim
-inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); }
+inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
+  return indexer(*this, i);
+}
 
 // Dynamic access to constant Dim
 // without std::enable_if will try to instantiate this on get<0>(d)
 template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim<l>& d,
-                                                           int i) {
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
+                                                               int i) {
   return d[i];
 }
 
 // Dynamic access to mutable Dim
 template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim<l>& d, int i) {
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d,
+                                                                int i) {
   return d[i];
 }
 
 // Dot product of two dims
 template <int i>
-HOSTDEVICE int linearize(const Dim<i>& a, const Dim<i>& b) {
+HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
   return a.head * b.head + linearize(a.tail, b.tail);
 }
 
 // Base case dot product of two Dims
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int linearize(const Dim<1>& a, const Dim<1>& b) {
+HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
   return a.head * b.head;
 }
 
 // Product of a Dim
 template <int i>
-HOSTDEVICE int product(const Dim<i>& a, int prod = 1) {
+HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
   return prod * a.head * product(a.tail);
 }
 
 // Base case product of a Dim
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int product(const Dim<1>& a, int prod) {
+HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
   return prod * a.head;
 }
 
diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu
index 3898d0a447..0a6a87669c 100644
--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
@@ -8,7 +8,7 @@ __global__ void test(paddle::framework::Dim<2>* o) {
   o[0] = paddle::framework::make_dim(5, 6);
 }
 
-__global__ void dyn_idx_gpu(int* o) {
+__global__ void dyn_idx_gpu(int64_t* o) {
   auto d = paddle::framework::make_dim(5, 6);
   o[0] = d[1];
 }
@@ -47,9 +47,9 @@ TEST(Dim, Equality) {
   EXPECT_EQ(b[1], 11);
 
   // dynamic access on GPU
-  thrust::device_vector<int> r(1);
+  thrust::device_vector<int64_t> r(1);
   dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
-  int res = r[0];
+  int64_t res = r[0];
   EXPECT_EQ(res, 6);
 
   // ex_prefix_mul
diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h
index a4667cc51f..2d8d9ae10c 100644
--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@@ -28,7 +28,7 @@ struct EigenDim {
   static Type From(const DDim& dims) {
     PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)");
     Type ret;
-    for (int d = 0; d < arity(dims); d++) {
+    for (int64_t d = 0; d < arity(dims); d++) {
       ret[d] = dims[d];
     }
     return ret;
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 368136a972..dfcb5fb621 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -87,3 +87,24 @@ message OpProto {
   repeated Attr attrs = 4;
   required string comment = 5;
 }
+
+enum DataType {
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+}
+
+message LoDTensorDesc {
+  required DataType data_type = 1;
+  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  optional int32 lod_level = 3 [ default = 0 ];
+}
+
+message VarDesc {
+  required string name = 1;
+  optional LoDTensorDesc lod_tensor = 2;
+}
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
index 902c2655e9..9e3ca563c6 100644
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -3,7 +3,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 
-USE_OP(add_two);
+USE_OP(add);
 
 namespace paddle {
 namespace framework {
@@ -41,7 +41,7 @@ namespace f = paddle::framework;
 
 TEST(GradOpBuilder, AddTwo) {
   std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
+      "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
   std::shared_ptr<f::OperatorBase> grad_add_op =
       f::OpRegistry::CreateGradOp(*add_op);
   EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
index 8dfe3ee823..769b61f175 100644
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -94,7 +94,7 @@ Let's go on slicing this slice.  Its <1,1>-slice is
 |||
 ```
 
-### The General Slicing Algorithm
+### The Slicing Algorithm
 
 The algorithm, with over-simplified data structure, is defined as
 
@@ -106,17 +106,41 @@ struct LoDTensor {
   float* tensor_;
 };
 
-LoDTensor Slice(const LoDTensor& lodt, int level, int sequence) {
+LoDTensor Slice(const LoDTensor& lodt, int level, int sequence);
+```
+
+Let us revisit the example above
 
-}
+```
+         3
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
 ```
 
-### Slicing the Top Level
+Suppose that we want to retrieve the <1,2>-slice
 
-Please be aware that an RNN operator only slices the top level of a LoD Tensor to get the step inputs.
+```
+2
+2  3
+|| |||
+```
 
-```c++
-LoDTensor Slice(const LoDTensor& lodt, int sequence) {
+we will need to find out the starting position of this slice by summing over all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10.
+
+To avoid the traversal of the LoD tree at slcing time,  we can do it at the construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offset of the next level.  For example, above LoD Tensor can be transformed into
+
+```
+        0
+0           9  10
+0   3  5    9  10 12
+||| || |||| |  || |||
+```
+
+We don't really need the 0 on top, so the LoD Tensor could be
 
-}
+```
+0           9  10
+0   3  5    9  10 12
+||| || |||| |  || |||
 ```
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index b43f6a8cc5..0e2fb27b65 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -80,7 +80,7 @@ TEST(OpRegistry, CreateOp) {
   paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
-  float scale_get = op->GetAttr<float>("scale");
+  float scale_get = op->Attr<float>("scale");
   ASSERT_EQ(scale_get, scale);
 }
 
@@ -121,7 +121,7 @@ TEST(OpRegistry, DefaultValue) {
   paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
-  ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
+  ASSERT_EQ(op->Attr<float>("scale"), 1.0);
 }
 
 TEST(OpRegistry, CustomChecker) {
@@ -172,6 +172,6 @@ TEST(OpRegistry, CustomChecker) {
   paddle::platform::CPUDeviceContext dev_ctx;
   paddle::framework::Scope scope;
   op->Run(scope, dev_ctx);
-  int test_attr = op->GetAttr<int>("test_attr");
+  int test_attr = op->Attr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
 }
\ No newline at end of file
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index da92220b04..9a98d4d3be 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -69,7 +69,7 @@ class OperatorBase {
   virtual ~OperatorBase() {}
 
   template <typename T>
-  inline const T& GetAttr(const std::string& name) const {
+  inline const T& Attr(const std::string& name) const {
     PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
                    name);
     return boost::get<T>(attrs_.at(name));
@@ -238,8 +238,8 @@ class InferShapeContext {
   const Scope& scope() const { return scope_; }
 
   template <typename T>
-  inline const T& GetAttr(const std::string& name) const {
-    return op_.GetAttr<T>(name);
+  inline const T& Attr(const std::string& name) const {
+    return op_.Attr<T>(name);
   }
 
   size_t InputSize(const std::string& name) const {
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 7893e233b7..94f436294f 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -58,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place) {
                     "Tensor's numel must be larger than zero to call "
                     "Tensor::mutable_data. Call Tensor::set_dim first.");
   /* some versions of boost::variant don't have operator!= */
-  size_t size = product(dims_) * sizeof(T);
+  int64_t size = product(dims_) * sizeof(T);
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
@@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
   PADDLE_ENFORCE_LT(begin_idx, end_idx,
                     "Begin index must be less than end index.");
   PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1.");
-  int base = product(dims_) / dims_[0];
+  size_t base = product(dims_) / dims_[0];
   Tensor dst;
   dst.holder_ = holder_;
   DDim dst_dims = dims_;
diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/gserver/layers/Conv3DLayer.cpp
index 3887aa58b2..9deda2de98 100644
--- a/paddle/gserver/layers/Conv3DLayer.cpp
+++ b/paddle/gserver/layers/Conv3DLayer.cpp
@@ -83,8 +83,8 @@ void Conv3DLayer::forward(PassType passType) {
   int outWidth = getSize();
   resetOutput(batchSize, outWidth);
 
+  REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
   for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
     const MatrixPtr &inMat = getInputValue(i);
     const MatrixPtr &outMat = getOutputValue();
     int M = M_[i];
@@ -120,7 +120,6 @@ void Conv3DLayer::forward(PassType passType) {
     }
   }
   if (nullptr != this->biasParameter_) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
     this->addBias();
   }
   forwardActivation();
@@ -134,15 +133,14 @@ void Conv3DLayer::backward(const UpdateCallback &callback) {
     biases_->getParameterPtr()->incUpdate(callback);
   }
 
+  REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
   for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
     if (weights_[i]->getWGrad()) {
       bpropWeights(i);
     }
     if (getInputGrad(i)) {
       bpropData(i);
     }
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
     weights_[i]->getParameterPtr()->incUpdate(callback);
   }
 }
diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/gserver/layers/DeConv3DLayer.cpp
index 2838980a97..1b59ed60c5 100644
--- a/paddle/gserver/layers/DeConv3DLayer.cpp
+++ b/paddle/gserver/layers/DeConv3DLayer.cpp
@@ -84,8 +84,8 @@ void DeConv3DLayer::forward(PassType passType) {
   resetOutput(batchSize, outWidth);
   const MatrixPtr outMat = getOutputValue();
 
+  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
   for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
     const MatrixPtr &inMat = getInputValue(i);
     int M = M_[i];
     int N = N_[i];
@@ -120,7 +120,6 @@ void DeConv3DLayer::forward(PassType passType) {
     }
   }
   if (nullptr != this->biasParameter_) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
     this->addBias();
   }
   forwardActivation();
@@ -133,12 +132,12 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
     bpropBiases();
     biases_->getParameterPtr()->incUpdate(callback);
   }
+  REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
   for (size_t i = 0; i < inputLayers_.size(); ++i) {
     if (weights_[i]->getWGrad() || this->needGradient_) {
       int M = M_[i];
       int N = N_[i];
       int K = K_[i];
-      REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
       Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
       const MatrixPtr &inMat = getInputValue(i);
       for (int n = 0; n < batchSize; ++n) {
@@ -182,7 +181,6 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
           }
         }
       }
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
   }
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index e5efcccb0e..8a0ff1eb53 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -14,27 +14,31 @@ function(op_library TARGET)
     cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
             "${multiValueArgs}" ${ARGN})
 
-    foreach(src ${op_library_SRCS})
-        if (${src} MATCHES ".*\\.cu$")
-            list(APPEND cu_srcs ${src})
-        elseif(${src} MATCHES ".*\\.cc$")
-            list(APPEND cc_srcs ${src})
-        else()
-            message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+    list(LENGTH op_library_SRCS op_library_SRCS_len)
+    if (${op_library_SRCS_len} EQUAL 0)
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+            list(APPEND cc_srcs ${TARGET}.cc)
         endif()
-    endforeach()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+            list(APPEND cu_srcs ${TARGET}.cu)
+        endif()
+    else()
+        foreach(src ${op_library_SRCS})
+            if (${src} MATCHES ".*\\.cu$")
+                list(APPEND cu_srcs ${src})
+            elseif(${src} MATCHES ".*\\.cc$")
+                list(APPEND cc_srcs ${src})
+            else()
+                message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+            endif()
+        endforeach()
+    endif()
 
     list(LENGTH cc_srcs cc_srcs_len)
     if (${cc_srcs_len} EQUAL 0)
         message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
     endif()
 
-    list(LENGTH cu_srcs cu_srcs_len)
-    list(LENGTH op_library_DEPS dep_len)
-    if (${cu_srcs_len} EQUAL 0 AND ${dep_len} EQUAL 0)
-        message(WARNING "The op library ${TARGET} not support GPU!")
-    endif()
-
     if (WITH_GPU)
         nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
@@ -46,22 +50,22 @@ endfunction()
 
 add_subdirectory(math)
 
-list(REMOVE_ITEM GENERAL_OPS
-     net_op
-     minus_op
-     mul_op
-     recurrent_op
-     scale_op)
-
-op_library(net_op SRCS net_op.cc)
-op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
-op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
+set(DEPS_OPS
+    identity_op
+    minus_op
+    mul_op
+    recurrent_op
+    scale_op)
+op_library(identity_op DEPS scale_op)
+op_library(minus_op DEPS scale_op)
+op_library(mul_op DEPS math_function)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc 
   DEPS framework_proto tensor operator net_op)
-op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
+op_library(scale_op DEPS net_op)
 
+list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
-    op_library(${src} SRCS ${src}.cc ${src}.cu)
+    op_library(${src})
 endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index 8ab748ed71..8dbd47cf0d 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -57,7 +57,6 @@ class AddOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, add_two_grad, ops::AddOpGrad);
+REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad);
 
-REGISTER_OP_CPU_KERNEL(add_two,
-                       ops::AddKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(add, ops::AddKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
index cec5f558cb..d9c6d20a6c 100644
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -12,10 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(add_two,
-                       ops::AddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(add, ops::AddKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
new file mode 100644
index 0000000000..c033af3b74
--- /dev/null
+++ b/paddle/operators/cos_sim_op.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/cos_sim_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class CosSimOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                      ctx.Input<Tensor>("Y")->dims(),
+                      "Dimensions of Input(X) and Input(Y) must be the same.");
+
+    auto dims = ctx.Input<Tensor>("X")->dims();
+    ctx.Output<Tensor>("Out")->Resize({dims[0], 1});
+    ctx.Output<Tensor>("XNorm")->Resize({dims[0], 1});
+    ctx.Output<Tensor>("YNorm")->Resize({dims[0], 1});
+  }
+};
+
+class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CosSimOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of cos_sim op.");
+    AddInput("Y", "The second input of cos_sim op.");
+    AddOutput("Out", "The output of cos_sim op.");
+    AddOutput("XNorm", "Row norm of the first input.").AsIntermediate();
+    AddOutput("YNorm", "Row norm of the second input.").AsIntermediate();
+
+    AddComment(R"DOC(
+Cosine Similarity Operator.
+
+The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y))
+)DOC");
+  }
+};
+
+class CosSimOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("XNorm"),
+                            "Input(XNorm) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("YNorm"),
+                            "Input(YNorm) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) must not be null.");
+
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    auto xnorm_dims = ctx.Input<Tensor>("XNorm")->dims();
+    auto ynorm_dims = ctx.Input<Tensor>("YNorm")->dims();
+    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    PADDLE_ENFORCE_EQ(x_dims, y_dims,
+                      "Dimensions of Input(X) and Input(Y) must be the same.");
+    PADDLE_ENFORCE_EQ(xnorm_dims[0], x_dims[0],
+                      "1st dimension of XNorm must equal that of Input(X).");
+    PADDLE_ENFORCE_EQ(xnorm_dims[1], 1, "2st dimension of XNorm must be one.");
+    PADDLE_ENFORCE_EQ(ynorm_dims[0], y_dims[0],
+                      "1st dimension of YNorm must equal that of Input(Y).");
+    PADDLE_ENFORCE_EQ(ynorm_dims[1], 1, "2st dimension of YNorm must be one.");
+    PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
+                      "1st dimension of Out@GRAD must equal that of Input(X)");
+    PADDLE_ENFORCE_EQ(out_dims[1], 1, "1st dimension of Out@GRAD must be one.");
+
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    if (x_grad) x_grad->Resize(x_dims);
+    if (y_grad) y_grad->Resize(y_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
+            ops::CosSimOpGrad);
+REGISTER_OP_CPU_KERNEL(cos_sim,
+                       ops::CosSimKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    cos_sim_grad, ops::CosSimGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/cos_sim_op.cu
similarity index 72%
rename from paddle/operators/gather_op.cu
rename to paddle/operators/cos_sim_op.cu
index 3f04a7b3f8..0cb8fd26de 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/cos_sim_op.cu
@@ -13,8 +13,10 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/gather_op.h"
+#include "paddle/operators/cos_sim_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(gather,
-                       ops::GatherOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(cos_sim,
+                       ops::CosSimKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    cos_sim_grad, ops::CosSimGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
new file mode 100644
index 0000000000..9e2bcebe3b
--- /dev/null
+++ b/paddle/operators/cos_sim_op.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class CosSimKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input_x = context.Input<Tensor>("X");
+    auto* input_y = context.Input<Tensor>("Y");
+    auto* output_z = context.Output<Tensor>("Out");
+    auto* output_x_norm = context.Output<Tensor>("XNorm");
+    auto* output_y_norm = context.Output<Tensor>("YNorm");
+
+    output_z->mutable_data<T>(context.GetPlace());
+    output_x_norm->mutable_data<T>(context.GetPlace());
+    output_y_norm->mutable_data<T>(context.GetPlace());
+
+    auto dims = input_x->dims();
+    int size = static_cast<int>(framework::product(dims));
+    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
+    auto x = EigenMatrix<T>::From(*input_x, new_dims);
+    auto y = EigenMatrix<T>::From(*input_y, new_dims);
+    auto z = EigenVector<T>::Flatten(*output_z);
+    auto x_norm = EigenVector<T>::Flatten(*output_x_norm);
+    auto y_norm = EigenVector<T>::Flatten(*output_y_norm);
+
+    auto place = context.GetEigenDevice<Place>();
+    auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
+    x_norm.device(place) = x.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
+    y_norm.device(place) = y.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
+    z.device(place) = xy / x_norm / y_norm;
+  }
+};
+
+template <typename Place, typename T>
+class CosSimGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input_x = context.Input<Tensor>("X");
+    auto* input_y = context.Input<Tensor>("Y");
+    auto* input_z = context.Input<Tensor>("Out");
+    auto* input_x_norm = context.Input<Tensor>("XNorm");
+    auto* input_y_norm = context.Input<Tensor>("YNorm");
+    auto* output_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto* input_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto dims = input_x->dims();
+    int size = static_cast<int>(framework::product(dims));
+    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
+    auto x = EigenMatrix<T>::From(*input_x, new_dims);
+    auto y = EigenMatrix<T>::From(*input_y, new_dims);
+    auto z = EigenMatrix<T>::From(*input_z);
+    auto x_norm = EigenMatrix<T>::From(*input_x_norm);
+    auto y_norm = EigenMatrix<T>::From(*input_y_norm);
+    auto dz = EigenMatrix<T>::From(*input_grad_z);
+
+    Eigen::DSizes<int, 2> bcast(1, new_dims[1]);
+    auto z_bcast = z.broadcast(bcast);
+    auto dz_bcast = dz.broadcast(bcast);
+    auto place = context.GetEigenDevice<Place>();
+    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast);
+    auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast);
+    auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast);
+    if (output_grad_x) {
+      output_grad_x->mutable_data<T>(context.GetPlace());
+      auto dx = EigenMatrix<T>::From(*output_grad_x, new_dims);
+      dx.device(place) =
+          dz_bcast * (y / norm_prod_bcast - z_bcast * x / x_snorm_bcast);
+    }
+    if (output_grad_y) {
+      output_grad_y->mutable_data<T>(context.GetPlace());
+      auto dy = EigenMatrix<T>::From(*output_grad_y, new_dims);
+      dy.device(place) =
+          dz_bcast * (x / norm_prod_bcast - z_bcast * y / y_snorm_bcast);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 056447901d..6574880c0e 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -19,20 +19,20 @@ template <typename T>
 class CPUGaussianRandomKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.GetAttr<float>("mean");
-    float std = context.GetAttr<float>("std");
+    float mean = context.Attr<float>("mean");
+    float std = context.Attr<float>("std");
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
-    unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed"));
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     std::minstd_rand engine;
     if (seed == 0) {
       seed = std::random_device()();
     }
     engine.seed(seed);
     std::normal_distribution<T> dist(mean, std);
-    ssize_t size = framework::product(tensor->dims());
-    for (ssize_t i = 0; i < size; ++i) {
+    int64_t size = framework::product(tensor->dims());
+    for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
   }
@@ -45,10 +45,15 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
-    auto dims = GetAttr<std::vector<int>>("dims");
+    auto dims = Attr<std::vector<int>>("dims");
+    std::vector<int64_t> temp;
+    temp.reserve(dims.size());
+    for (auto dim : dims) {
+      temp.push_back(static_cast<int64_t>(dim));
+    }
     PADDLE_ENFORCE(dims.size() > 0UL,
                    "dims can be one int or array. dims must be set.");
-    tensor->Resize(framework::make_ddim(dims));
+    tensor->Resize(framework::make_ddim(temp));
   }
 };
 
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index 833a82bbf2..d9dbc1dcfe 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -42,13 +42,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed"));
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     if (seed == 0) {
       std::random_device rd;
       seed = rd();
     }
-    T mean = static_cast<T>(context.GetAttr<float>("mean"));
-    T std = static_cast<T>(context.GetAttr<float>("std"));
+    T mean = static_cast<T>(context.Attr<float>("mean"));
+    T std = static_cast<T>(context.Attr<float>("std"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     ssize_t N = framework::product(tensor->dims());
     thrust::transform(index_sequence_begin, index_sequence_begin + N,
diff --git a/paddle/operators/identity_op.cc b/paddle/operators/identity_op.cc
new file mode 100644
index 0000000000..be956bf3b3
--- /dev/null
+++ b/paddle/operators/identity_op.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/net_op.h"
+#include "paddle/operators/scale_op.h"
+
+namespace paddle {
+namespace operators {
+
+// identity is a alias of scale op. This is also a example for creating a alias
+// operator.
+template <typename AttrType>
+class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IdentityOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "input tensor of identity op");
+    AddOutput("Out", "output tensor of identity op");
+    AddComment("identity operator. Just a alias of scale op which scale = 1.0");
+  }
+};
+
+template <typename AttrType>
+class IdentityOp : public NetOp {
+ public:
+  IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    AppendOp(framework::OpRegistry::CreateOp(
+        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
+        {{"scale", static_cast<AttrType>(1)}}));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
+                             ops::IdentityOpMaker<float>);
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index a9b65c30f2..97872c67ac 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -61,7 +61,7 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
       PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                      outlinks[i].internal);
       f::DDim step_dims = step_scope_var->template GetMutable<Tensor>()->dims();
-      std::vector<int> dims_vec = vectorize(step_dims);
+      std::vector<int64_t> dims_vec = vectorize(step_dims);
       dims_vec.insert(dims_vec.begin(), seq_len);
       output->Resize(f::make_ddim(dims_vec));
     } else {
@@ -109,7 +109,7 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   arg->step_scopes = op.Output(name.step_scopes);
 
   auto inlinks = op.Inputs(name.inlinks);
-  auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
+  auto inlink_alias = op.Attr<std::vector<std::string>>(name.inlink_alias);
   PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
                  "the size of inlinks and inlink_alias don't match:%d,%d",
                  inlinks.size(), inlink_alias.size());
@@ -121,7 +121,7 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   }
 
   auto outlinks = op.Outputs(name.outlinks);
-  auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
+  auto outlink_alias = op.Attr<std::vector<std::string>>(name.outlink_alias);
   PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
                  "the size of outlinks and outlink_alias don't match:%d,%d",
                  outlinks.size(), outlink_alias.size());
@@ -135,8 +135,8 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   auto boot_memories = op.Inputs(name.boot_memories);
 
   // attributes
-  auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
-  auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);
+  auto memories = op.Attr<std::vector<std::string>>(name.memories);
+  auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
 
   PADDLE_ENFORCE(memories.size() == boot_memories.size(),
                  "the size of memories, boot_memories don't match:%d,%d",
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 8e96a74c94..3d82b34582 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -48,7 +48,7 @@ The equation is: Out = scale*X
   }
 };
 
-// Identity Op's gradient is identity op, too.
+// Scale Op's gradient is scale op, too.
 // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
 template <typename AttrType>
 class ScaleGradOp : public NetOp {
@@ -60,38 +60,11 @@ class ScaleGradOp : public NetOp {
     AppendOp(framework::OpRegistry::CreateOp(
         "scale", {{"X", {Input(framework::GradVarName("Out"))}}},
         {{"Out", {Output(framework::GradVarName("X"))}}},
-        {{"scale", GetAttr<AttrType>("scale")}}));
+        {{"scale", Attr<AttrType>("scale")}}));
     CompleteAddOp(false);
   }
 };
 
-// identity is a alias of scale op. This is also a example for creating a alias
-// operator.
-template <typename AttrType>
-class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  IdentityOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "input tensor of identity op");
-    AddOutput("Out", "output tensor of identity op");
-    AddComment("identity operator. Just a alias of scale op which scale = 1.0");
-  }
-};
-
-template <typename AttrType>
-class IdentityOp : public NetOp {
- public:
-  IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(framework::OpRegistry::CreateOp(
-        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
-        {{"scale", static_cast<AttrType>(1)}}));
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -101,5 +74,3 @@ REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
             ops::ScaleGradOp<float>);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
-                             ops::IdentityOpMaker<float>);
diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h
index 65fb77eefa..02fbdc52bb 100644
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
@@ -27,7 +27,7 @@ class ScaleKernel : public framework::OpKernel {
     auto* in = context.Input<framework::Tensor>("X");
     tensor->mutable_data<T>(in->place());
 
-    auto scale = static_cast<T>(context.GetAttr<AttrType>("scale"));
+    auto scale = static_cast<T>(context.Attr<AttrType>("scale"));
 
     auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
deleted file mode 100644
index 6716b47883..0000000000
--- a/paddle/operators/scatter_op.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/operators/scatter_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(scatter,
-                       ops::ScatterOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index 8422b622ee..f8888f9c36 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -31,7 +31,7 @@ class SGDOpKernel : public framework::OpKernel {
     auto param = ctx.Input<Tensor>("param");
     auto grad = ctx.Input<Tensor>("grad");
     auto param_out = ctx.Output<Tensor>("param_out");
-    float lr = ctx.GetAttr<float>("learning_rate");
+    float lr = ctx.Attr<float>("learning_rate");
 
     param_out->mutable_data<T>(ctx.GetPlace());
 
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 40c51a64c4..7d062ad67c 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -24,7 +24,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
-                   "The input of softmax op must be matrix");
+                   "The input of softmax op must be a matrix.");
     ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };
@@ -34,9 +34,27 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
   SoftmaxOpMaker(framework::OpProto *proto,
                  framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "input of softmax");
-    AddOutput("Y", "output of softmax");
-    AddComment("Softmax Op");
+    AddInput("X",
+             "The input tensor of softmax. "
+             "2-D with shape [batch_size, input_feature_dimensions].");
+    AddOutput("Y", "The normalized values with the same shape as X.");
+    AddComment(R"DOC(
+The input of softmax operator is a 2-D tensor with shape N x K (N is the
+batch_size, K is the dimension of input feature). The output tensor has the
+same shape as the input tensor.
+
+For each row of the input tensor, the softmax operator squashes the
+K-dimensional vector of arbitrary real values to a K-dimensional vector of real
+values in the range [0, 1] that add up to 1. Specifically, it computes the
+exponential of the given dimension and the sum of exponential values of all
+the other dimensions in the K-dimensional vector input. Then the ratio of the
+exponential of the given dimension and the sum of exponential values of all
+the other dimensions is the output of the softmax operator.
+
+For each row `i` and each column `j` in X, we have:
+    Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j]))
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 2d943c4508..f2aeef6c31 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -26,17 +26,17 @@ class CPUUniformRandomKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed"));
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     std::minstd_rand engine;
     if (seed == 0) {
       seed = std::random_device()();
     }
     engine.seed(seed);
     std::uniform_real_distribution<T> dist(
-        static_cast<T>(context.GetAttr<float>("min")),
-        static_cast<T>(context.GetAttr<float>("max")));
-    ssize_t size = framework::product(tensor->dims());
-    for (ssize_t i = 0; i < size; ++i) {
+        static_cast<T>(context.Attr<float>("min")),
+        static_cast<T>(context.Attr<float>("max")));
+    int64_t size = framework::product(tensor->dims());
+    for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
   }
@@ -48,11 +48,16 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE(GetAttr<float>("min") < GetAttr<float>("max"),
+    PADDLE_ENFORCE(Attr<float>("min") < Attr<float>("max"),
                    "uniform_random's min must less then max");
     auto* tensor = ctx.Output<framework::Tensor>("Out");
-    auto dims = GetAttr<std::vector<int>>("dims");
-    tensor->Resize(framework::make_ddim(dims));
+    auto dims = Attr<std::vector<int>>("dims");
+    std::vector<int64_t> temp;
+    temp.reserve(dims.size());
+    for (auto dim : dims) {
+      temp.push_back(static_cast<int64_t>(dim));
+    }
+    tensor->Resize(framework::make_ddim(temp));
   }
 };
 
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index df993c0779..c2c041b144 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -45,13 +45,13 @@ class GPUUniformRandomKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed"));
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     if (seed == 0) {
       std::random_device rd;
       seed = rd();
     }
-    T min = static_cast<T>(context.GetAttr<float>("min"));
-    T max = static_cast<T>(context.GetAttr<float>("max"));
+    T min = static_cast<T>(context.Attr<float>("min"));
+    T max = static_cast<T>(context.Attr<float>("max"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     ssize_t N = framework::product(tensor->dims());
     thrust::transform(index_sequence_begin, index_sequence_begin + N,
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
index 24ddf3441c..2841d2a2db 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/macros.h"
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 2200bc2af2..c530df6101 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -30,7 +30,7 @@ limitations under the License. */
 
 namespace py = pybind11;
 
-USE_OP(add_two);
+USE_OP(add);
 USE_OP(onehot_cross_entropy);
 USE_OP(sgd);
 USE_OP(mul);
@@ -46,6 +46,7 @@ USE_OP(lookup_table);
 USE_OP(scale);
 USE_NO_KERNEL_OP(identity);
 USE_OP(minus);
+USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
 USE_OP(clip);
@@ -77,7 +78,7 @@ PYBIND11_PLUGIN(core) {
       .def("get_dims",
            [](const Tensor &self) { return vectorize(self.dims()); })
       .def("set_dims",
-           [](Tensor &self, const std::vector<int> &dim) {
+           [](Tensor &self, const std::vector<int64_t> &dim) {
              self.Resize(make_ddim(dim));
            })
       .def("alloc_float",
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 39ba60b4dc..95171acf72 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -85,7 +85,7 @@ void PyCPUTensorSetFromArray(
     framework::Tensor &self,
     py::array_t<T, py::array::c_style | py::array::forcecast> array,
     paddle::platform::CPUPlace &place) {
-  std::vector<int> dims;
+  std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
     dims.push_back((int)array.shape()[i]);
@@ -102,7 +102,7 @@ void PyCUDATensorSetFromArray(
     framework::Tensor &self,
     py::array_t<T, py::array::c_style | py::array::forcecast> array,
     paddle::platform::GPUPlace &place) {
-  std::vector<int> dims;
+  std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
     dims.push_back((int)array.shape()[i]);
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 661ebd8964..e0f77d7973 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ py_test(test_scope SRCS test_scope.py)
 
 py_test(test_tensor SRCS test_tensor.py)
 py_test(test_mul_op SRCS test_mul_op.py)
+py_test(test_cos_sim_op SRCS test_cos_sim_op.py)
 
 py_test(test_mean_op SRCS test_mean_op.py)
 
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index b8d7e4ea43..fdb06b7988 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -36,13 +36,13 @@ def get_numeric_gradient(op,
                          in_place=False):
     """
     Get Numeric Gradient for an operator's input.
-    
-    :param op: C++ operator instance, could be an network 
-    :param input_values: The input variables. Should be an dictionary, key is 
+
+    :param op: C++ operator instance, could be an network
+    :param input_values: The input variables. Should be an dictionary, key is
     variable name. Value is numpy array.
-    :param output_name: The final output variable name. 
+    :param output_name: The final output variable name.
     :param input_to_check: The input variable need to get gradient.
-    :param delta: The perturbation value for numeric gradient method. The 
+    :param delta: The perturbation value for numeric gradient method. The
     smaller delta is, the more accurate result will get. But if that delta is
      too small, it could occur numerical stability problem.
     :param local_scope: The local scope used for get_numeric_gradient.
@@ -229,9 +229,9 @@ class GradientChecker(unittest.TestCase):
         """Use relative error for the comparison.
 
         :param numeric_grads: the numerical graidents.
-        :type numeric_grads: a list of numpy.array 
+        :type numeric_grads: a list of numpy.array
         :param analytic_grads: the analytical graidents.
-        :type analytic_grads: a list of numpy.array 
+        :type analytic_grads: a list of numpy.array
         :param name: the names of gradients, used to print for debug.
         :type names: a list of string
         :param msg_prefix: string info, used to print for debug.
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index bb2d0a3f3a..5594b59bf7 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -6,13 +6,13 @@ from paddle.v2.framework.op import Operator
 class OpTestMeta(type):
     """
     Operator Test ClassMeta.
-    
-    It injects `test_all` method into user's OperatorTest class, to make Python 
+
+    It injects `test_all` method into user's OperatorTest class, to make Python
     unittest module run that method.
-    
+
     The `test_all` read what value is stored in `self`. It use self's values to
     create and run a operator, and check whether that op is OK or not.
-    
+
     See `test_add_two_op` for example usage.
     """
 
diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py
index 0def484edd..a578e74eca 100644
--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
@@ -11,7 +11,7 @@ class TestAddOp(unittest.TestCase):
     __metaclass__ = OpTestMeta
 
     def setUp(self):
-        self.type = "add_two"
+        self.type = "add"
         self.inputs = {
             'X': numpy.random.random((102, 105)).astype("float32"),
             'Y': numpy.random.random((102, 105)).astype("float32")
diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py
new file mode 100644
index 0000000000..32013a7999
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
@@ -0,0 +1,60 @@
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+
+
+class TestCosSimOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((32, 64)).astype("float32"),
+            'Y': np.random.random((32, 64)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+
+
+class TestCosSimGradOp(GradientChecker):
+    def setUp(self):
+        self.op = create_op("cos_sim")
+        self.inputs = {
+            'X': np.random.random((10, 5)).astype("float32"),
+            'Y': np.random.random((10, 5)).astype("float32")
+        }
+
+    def test_cpu_gpu_compare(self):
+        self.compare_grad(self.op, self.inputs)
+
+    def test_normal(self):
+        self.check_grad(
+            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.05)
+
+    def test_ignore_x(self):
+        self.check_grad(
+            self.op,
+            self.inputs, ["Y"],
+            "Out",
+            max_relative_error=0.05,
+            no_grad_set={"X"})
+
+    def test_ignore_y(self):
+        self.check_grad(
+            self.op,
+            self.inputs, ["X"],
+            "Out",
+            max_relative_error=0.05,
+            no_grad_set={"Y"})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py
index e0b3151208..857427cdfb 100644
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ b/python/paddle/v2/framework/tests/test_gradient_checker.py
@@ -7,7 +7,7 @@ from gradient_checker import get_numeric_gradient
 
 class GetNumericGradientTest(unittest.TestCase):
     def test_add_op(self):
-        add_op = Operator('add_two', X="X", Y="Y", Out="Z")
+        add_op = Operator('add', X="X", Y="Y", Out="Z")
         x = numpy.random.random((10, 1)).astype("float32")
         y = numpy.random.random((10, 1)).astype("float32")
 
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py
index 9339cf28da..e4b7cd480c 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -15,7 +15,7 @@ def fc(X, W, Y):
 class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
-        op1 = Operator("add_two", X="X", Y="Y", Out="Out")
+        op1 = Operator("add", X="X", Y="Y", Out="Out")
         net.append_op(op1)
 
         net2 = core.Net.create()
@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase):
 
         expected = '''
 Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(add_two), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
+    Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
     Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
         Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
             Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/framework/tests/test_operator.py
index 1abc4eeb57..040556322d 100644
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase):
 
 class TestOpCreations(unittest.TestCase):
     def test_all(self):
-        add_op = op.Operator("add_two", X="a", Y="b", Out="z")
+        add_op = op.Operator("add", X="a", Y="b", Out="z")
         self.assertIsNotNone(add_op)
         # Invoke C++ DebugString()
-        self.assertEqual('Op(add_two), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
+        self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
                          str(add_op))
 
 
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index d6000ab9f9..22e680fd78 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -146,7 +146,7 @@ class TestRecurrentOp(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add_two", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@alias")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]: