Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into op_transpose

update-doc-pybind
xzl 8 years ago
commit 828008e41d

@ -4,7 +4,6 @@ cache:
- $HOME/.ccache - $HOME/.ccache
- $HOME/.cache/pip - $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party - $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required sudo: required
dist: trusty dist: trusty
os: os:
@ -12,7 +11,6 @@ os:
env: env:
- JOB=build_doc - JOB=build_doc
- JOB=check_style - JOB=check_style
- JOB=build_android
addons: addons:
apt: apt:
packages: packages:
@ -23,7 +21,6 @@ addons:
- python - python
- python-pip - python-pip
- python2.7-dev - python2.7-dev
- python-numpy
- python-wheel - python-wheel
- libboost-dev - libboost-dev
- curl - curl
@ -37,8 +34,8 @@ before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
# protobuf version. # protobuf version.
- pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
- curl https://glide.sh/get | bash - curl https://glide.sh/get | bash
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- go get -u github.com/alecthomas/gometalinter - go get -u github.com/alecthomas/gometalinter

@ -65,8 +65,8 @@ if(NOT CMAKE_BUILD_TYPE)
endif() endif()
if(ANDROID) if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21") message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
endif() endif()
set(WITH_GPU OFF CACHE STRING set(WITH_GPU OFF CACHE STRING

@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG ANDROID_ABI
ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV HOME=/root \ ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \ ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y \ apt-get install -y \
@ -15,12 +21,11 @@ RUN apt-get update && \
apt-get clean -y apt-get clean -y
# Install Go and glide # Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -C /usr/local -xzf go.tgz && \ tar -xz -C /usr/local && \
mkdir /root/gopath && \ mkdir /root/gopath && \
mkdir /root/gopath/bin && \ mkdir /root/gopath/bin && \
mkdir /root/gopath/src && \ mkdir /root/gopath/src
rm go.tgz
ENV GOROOT=/usr/local/go GOPATH=/root/gopath ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \ ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
rm -rf /opt/android-ndk-tmp && \ rm -rf /opt/android-ndk-tmp && \
rm -rf ${ANDROID_NDK_HOME} rm -rf ${ANDROID_NDK_HOME}
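For reference, a hedged sketch of how the new `ANDROID_ABI` build argument could be overridden when building this image; the Dockerfile name and image tag below are assumptions, not taken from this diff.

```bash
# Build the Android cross-compiling image for arm64-v8a instead of the
# default armeabi-v7a ABI (file name and tag are illustrative).
docker build -t paddle:dev-android-arm64 \
  --build-arg ANDROID_ABI=arm64-v8a \
  -f Dockerfile.android .
```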

@ -20,6 +20,7 @@
# The supported variables are listed belows: # The supported variables are listed belows:
# #
# ANDROID_STANDALONE_TOOLCHAIN # ANDROID_STANDALONE_TOOLCHAIN
# ANDROID_TOOLCHAIN
# ANDROID_ABI # ANDROID_ABI
# ANDROID_NATIVE_API_LEVEL # ANDROID_NATIVE_API_LEVEL
# ANDROID_ARM_MODE # ANDROID_ARM_MODE
@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
ENDIF() ENDIF()
ENDIF() ENDIF()
IF(NOT DEFINED ANDROID_TOOLCHAIN)
SET(ANDROID_TOOLCHAIN clang)
ENDIF()
IF(NOT DEFINED ANDROID_ABI) IF(NOT DEFINED ANDROID_ABI)
SET(ANDROID_ABI "armeabi-v7a") SET(ANDROID_ABI "armeabi-v7a")
ENDIF() ENDIF()
@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
"${CMAKE_VERSION}), when cross-compiling for Android.") "${CMAKE_VERSION}), when cross-compiling for Android.")
IF(ANDROID_STANDALONE_TOOLCHAIN) IF(ANDROID_STANDALONE_TOOLCHAIN)
# Use standalone toolchain
SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
IF(NOT CMAKE_SYSTEM_VERSION) IF(NOT CMAKE_SYSTEM_VERSION)
@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ENDIF() ENDIF()
# Toolchain # Toolchain
SET(ANDROID_TOOLCHAIN "gcc")
SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
ELSE(ANDROID_NDK)
# TODO: use android ndk
ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
IF(ANDROID_ABI STREQUAL "armeabi") IF(ANDROID_ABI STREQUAL "armeabi")
SET(CMAKE_SYSTEM_PROCESSOR armv5te) SET(CMAKE_SYSTEM_PROCESSOR armv5te)
SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a) SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
ENDIF() ENDIF()
ENDIF() ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
IF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64) SET(CMAKE_SYSTEM_PROCESSOR aarch64)
SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
ENDIF() ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
IF(ANDROID_TOOLCHAIN STREQUAL clang)
SET(ANDROID_C_COMPILER_NAME clang)
SET(ANDROID_CXX_COMPILER_NAME clang++)
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
SET(ANDROID_C_COMPILER_NAME gcc)
SET(ANDROID_CXX_COMPILER_NAME g++)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
ENDIF() ENDIF()
# C compiler # C compiler
IF(NOT CMAKE_C_COMPILER) IF(NOT CMAKE_C_COMPILER)
SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc") SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
ELSE() ELSE()
GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
ENDIF() ENDIF()
@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
# CXX compiler # CXX compiler
IF(NOT CMAKE_CXX_COMPILER) IF(NOT CMAKE_CXX_COMPILER)
SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++") SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
ELSE() ELSE()
GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
ENDIF() ENDIF()
@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
# Toolchain and ABI specific flags. # Toolchain and ABI specific flags.
SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64") SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
IF(ANDROID_ABI STREQUAL "armeabi") IF(ANDROID_ABI STREQUAL "armeabi")
@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
-march=armv5te -march=armv5te
-mtune=xscale -mtune=xscale
-msoft-float) -msoft-float)
ENDIF() ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
IF(ANDROID_ABI STREQUAL "armeabi-v7a")
LIST(APPEND ANDROID_COMPILER_FLAGS LIST(APPEND ANDROID_COMPILER_FLAGS
-march=armv7-a -march=armv7-a
-mfloat-abi=softfp) -mfloat-abi=softfp)
@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
ENDIF() ENDIF()
LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
ENDIF() ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ELSE() ELSE()
LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
ENDIF() ENDIF()
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# Disable integrated-as for better compatibility.
LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
ENDIF()
ENDIF() ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a") IF(ANDROID_TOOLCHAIN STREQUAL clang)
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) # CMake automatically forwards all compiler flags to the linker,
# and clang doesn't like having -Wa flags being used for linking.
# To prevent CMake from doing this would require meddling with
# the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
ENDIF() ENDIF()
STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
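A hedged example of configuring a cross-compile build with the options introduced above; the toolchain path follows the Dockerfile earlier in this diff, while the overall invocation is an illustrative sketch rather than the project's documented command.

```bash
# Illustrative Android cross-compile configuration using the new
# ANDROID_TOOLCHAIN option (clang is also the default).
cmake .. -DCMAKE_SYSTEM_NAME=Android \
         -DANDROID_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain \
         -DANDROID_ABI=arm64-v8a \
         -DANDROID_TOOLCHAIN=clang \
         -DWITH_GPU=OFF
```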

@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas) INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})

@ -86,12 +86,13 @@ def layer.fc(X):
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:

| C++ functions/functors | mul | add | | |
|------------------------|--------------|--------------|-------------|----------|
| C++ operator class | mulOp | addOp | FCOp | |
| Python binding | operator.mul | operator.add | operator.fc | |
| Python function | | | | layer.fc |

This is how we differentiate layer and operators in PaddlePaddle:

@ -1,4 +1,4 @@
# Design Doc: Computations as a Graph

A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
@ -8,6 +8,8 @@ This document explains that the construction of a graph as three steps:
- construct the backward part
- construct the optimization part

## The Construction of a Graph

Let us take the problem of image classification as a simple example. The application program that trains the model looks like:

```python
@ -25,7 +27,9 @@ The first four lines of above program build the forward part of the graph.
![](images/graph_construction_example_forward_only.png)

In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
Initialization operators are a kind of "run-once" operator: the `Run` method increments a class data member counter so that the body executes at most once. This way, a parameter is not initialized repeatedly, say, in every minibatch.
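A minimal C++ sketch of the run-once behavior described above (illustrative only, not the actual PaddlePaddle operator code):

```cpp
// Sketch only: a "run-once" initialization operator. The counter guards the
// body so a parameter is filled on the first Run() and skipped afterwards.
class InitOp {
 public:
  void Run() {
    if (run_count_ > 0) return;  // already initialized, do nothing
    ++run_count_;
    // ... fill the parameter tensor, e.g. from a random_uniform source ...
  }

 private:
  int run_count_ = 0;  // the class data member counter mentioned above
};
```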
In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
@ -49,3 +53,18 @@ According to the chain rule of gradient computation, `ConstructBackwardGraph` wo
For each parameter, like W and b created by `layer.fc`, marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph:

![](images/graph_construction_example_all.png)
## Block and Graph
The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables enclosed in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
A Block keeps operators in an array `BlockDesc::ops`
```protobuf
message BlockDesc {
repeated OpDesc ops = 1;
repeated VarDesc vars = 2;
}
```
in the order in which they appear in the user program, like the Python program at the beginning of this article. We can imagine that `ops` holds some forward operators, followed by some gradient operators, and then some optimization operators.
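Purely as an illustration of that ordering, the `ops` array for the example program might conceptually contain entries like the following; the operator names are made up for this sketch.

```python
# Hypothetical ordering of BlockDesc::ops for the example program.
ops = [
    "feed_images", "feed_labels", "init_W", "init_b",  # feeds and run-once init
    "fc", "mse",                                       # forward operators
    "mse_grad", "fc_grad",                             # gradient operators
    "sgd_W", "sgd_b",                                  # optimization operators
]
print("\n".join(ops))
```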

@ -2,6 +2,8 @@ digraph ImageClassificationGraph {
///////// The forward part ///////// ///////// The forward part /////////
FeedX [label="Feed", color=blue, shape=box]; FeedX [label="Feed", color=blue, shape=box];
FeedY [label="Feed", color=blue, shape=box]; FeedY [label="Feed", color=blue, shape=box];
InitW [label="Init", color=blue, shape=diamond];
Initb [label="Init", color=blue, shape=diamond];
FC [label="FC", color=blue, shape=box]; FC [label="FC", color=blue, shape=box];
MSE [label="MSE", color=blue, shape=box]; MSE [label="MSE", color=blue, shape=box];
@ -14,6 +16,8 @@ digraph ImageClassificationGraph {
FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
FeedY -> l [color=blue]; FeedY -> l [color=blue];
InitW -> W [color=blue];
Initb -> b [color=blue];
W -> FC [color=blue]; W -> FC [color=blue];
b -> FC [color=blue]; b -> FC [color=blue];
l -> MSE [color=blue]; l -> MSE [color=blue];


@ -147,7 +147,7 @@ class CosineOp {
struct CosineOpProtoMaker : public OpProtoMaker { struct CosineOpProtoMaker : public OpProtoMaker {
CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
AddInput("input", "input of cosine op"); AddInput("input", "input of cosine op");
AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0); AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
AddType("cos"); AddType("cos");
AddComment("This is cos op"); AddComment("This is cos op");
} }

@ -0,0 +1,124 @@
## Background
PaddlePaddle divides the description of a neural network computation graph into two stages: compile time and runtime.

PaddlePaddle uses protobuf messages to describe the compile-time graph because:

1. The computation graph should be able to be saved to a file.
1. In distributed training, the graph will be serialized and sent to multiple workers.

The computation graph is constructed from data nodes and operation nodes. The concepts used to represent them are listed in the table below.
| |compile time|runtime|
|---|---|---|
|Data|VarDesc(proto)|Variable(cpp)|
|Operation|OpDesc(proto)|Operator(cpp)|
## Definition of VarDesc
A VarDesc should have a name and a value. In PaddlePaddle, the value is always a tensor; since we use LoDTensor most of the time, we add a LoDTensorDesc to represent it.
```proto
message VarDesc {
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
}
```
## Definition of LoDTensorDesc
```proto
enum DataType {
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LoDTensorDesc {
required DataType data_type = 1;
repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [default=0];
}
```
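As an illustration (not part of the design doc), a variable holding a batch of 640x480 images could be described with the two messages above roughly as follows in protobuf text format; `FP32` and the batch dimension `-1` are assumptions based on the examples below.

```proto
# Illustrative text-format instance of VarDesc / LoDTensorDesc.
name: "image"
lod_tensor {
  data_type: FP32
  dims: [-1, 640, 480]
  lod_level: 0
}
```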
## Definition of Variable in Python
In the Python API, a layer takes Variables as input and returns Variables as output. There should be a class `Variable` in Python to help create and manage Variables.
```python
image = Variable(dims=[-1, 640, 480])
# fc1 and fc2 are both Variable
fc1 = layer.fc(input=image, output_size=10)
fc2 = layer.fc(input=fc1, output_size=20)
```
### What should class `Variable` have

1. `name` : a name of string type is used to identify the Variable.
1. `initializer` : since our Tensor does not hold a value initially, we always use some Operator to fill it at run time, so we should have an initialize method that helps add the init operator.
1. `operator` : a Variable should record which operator produces it. The reason is:
   - we use `pd.eval(targets=[var1, var2])` to run the related ops to get the values of var1 and var2, and `var.op` is used to trace the dependencies of the current variable.

In PaddlePaddle, we use a Block to describe the computation graph, so in the code we will use Block rather than Graph.
```python
import VarDesc
import LoDTensorDesc
import framework
def AddInitialOperator(variable, initializer):
    # add an initialization Operator to the block to init this Variable
    pass


class Variable(object):
    def __init__(self, name, dims, type, initializer):
        self._block = get_default_block()
        self._name = name
        self.op = None

        tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
        _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
        self._var = framework.CreateVar(_var_desc)
        self._block.add_var(self)

        # add initial op according to initializer
        if initializer is not None:
            AddInitialOperator(self, initializer)

    def dims(self):
        return self._var.dims()

    def data_type(self):
        return self._var.data_type()

    def to_proto(self):
        pass
```
Then we can use this Variable to create an fc layer in Python.

```python
import paddle as pd

def flatten_size(X, num_flatten_dims):
    prod = 1  # product of the last num_flatten_dims dimensions
    for i in xrange(num_flatten_dims):
        prod = prod * X.dims[-i - 1]
    return prod

def layer.fc(X, output_size, num_flatten_dims):
    W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
    b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
    out = Variable(type=FP32)
    y = operator.fc(X, W, b, output=out)  # fc will put fc op output into out
    pd.InferShape(y)
    return out

x = Variable(dims=[-1, 640, 480])
y = layer.fc(x, output_size=100)
z = layer.fc(y, output_size=200)

paddle.eval(targets=[z], ...)
print(z)
```

@ -45,7 +45,9 @@ Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中否则CPU
### 1. Define the ProtoMaker class

The formula for matrix multiplication is $Out = X * Y$; the computation has two inputs and one output.

First, define a `ProtoMaker` to describe the Op's inputs and outputs, and add comments:

```cpp
class MulOpMaker : public framework::OpProtoAndCheckerMaker {
@ -63,17 +65,17 @@ The equation is: Out = X * Y
};
```

[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43) inherits from `framework::OpProtoAndCheckerMaker`, and its constructor takes two arguments:

- `framework::OpProto` : the former stores the Op's inputs, outputs, and attribute parameters, and is used to generate the Python API.
- `framework::OpAttrChecker` : the latter is used to check the validity of attribute parameters.

In the constructor, `AddInput` adds input parameters, `AddOutput` adds output parameters, and `AddComment` adds the Op's comment; these functions append the corresponding content to `OpProto`.

The code above adds two inputs `X` and `Y` and one output `Out` to `MulOp` and explains their meanings; please follow the naming conventions when naming them.

Take [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) as an example:

```cpp
template <typename AttrType>
@ -93,12 +95,14 @@ The equation is: Out = scale*X
This example differs in two respects:

- `AddInput("X","...").NotInGradient()` : indicates that the input `X` does not take part in the computation of the gradient Op corresponding to `ScaleOp`. If one of an Op's inputs is not involved in the backward gradient computation, please mark it explicitly by calling `.NotInGradient()`.
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : adds a `scale` coefficient as an attribute parameter and sets its default value to 1.0.

### 2. Define the Operator class

The following code defines `MulOp`:

```cpp
class MulOp : public framework::OperatorWithKernel {
@ -143,10 +147,23 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
- 1). Do the checks and report errors as early as possible: check whether the dimensions, types, etc. of the input data are valid.
- 2). Set the shape of the output Tensor.

Usually the definitions of the `OpProtoMaker` and `Op` classes are written in the `.cc` file, together with the registration functions introduced below.

### 3. Define the OpKernel class

`MulKernel` inherits from `framework::OpKernel` and has the following two template parameters:

- `typename Place` : the device type. When different devices (CPU, GPU) share the same Kernel, this template parameter is needed; when they do not share one, it is omitted. An example that does not share a Kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
- `typename T` : the data type, such as `float` or `double`.

The `Compute` interface needs to be overridden for the `MulKernel` class.

- `Compute` accepts one input parameter: `const framework::ExecutionContext& context`.
- Compared with `InferShapeContext`, `ExecutionContext` adds the device type; the inputs, outputs, and attribute parameters can likewise be obtained from it.
- The concrete computation logic of the `OpKernel` is implemented in the `Compute` function.

Below is the implementation of `MulKernel::Compute`:

```cpp
template <typename Place, typename T>
class MulKernel : public framework::OpKernel {
@ -163,23 +180,19 @@ class MulKernel : public framework::OpKernel {
};
```

Note: **different devices (CPU, GPU) share a single Op definition; whether they share the same `OpKernel` depends on whether the functions called by `Compute` support different devices.**

The CPU and GPU implementations of `MulOp` share the same `Kernel`. For an example where the `OpKernel` is not shared, see [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).

To make the computation in `OpKernel` easier to write, and to allow the CPU and GPU code to be reused, we usually implement the `Compute` interface with the Eigen unsupported Tensor module. For how to use the Eigen library in PaddlePaddle, please refer to the [usage documentation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).

At this point the forward Op is implemented. Next, the op and kernel need to be registered in the `.cc` file.

The definitions of the backward Op class and the backward OpKernel are similar to those of the forward Op and are not repeated here. **Note, however, that the backward Op has no `ProtoMaker`**.

### 4. Register the Operator

- Register the forward and backward Op classes and the CPU Kernel in the `.cc` file.

```cpp
namespace ops = paddle::operators;
@ -189,11 +202,15 @@ REGISTER_OP_CPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::CPUPlace, float>);
```

In the code above:

- `REGISTER_OP` : registers the `ops::MulOp` class under the type name `mul`, together with its `ProtoMaker` class `ops::MulOpMaker`, and registers `ops::MulOpGrad` under the type name `mul_grad`.
- `REGISTER_OP_WITHOUT_GRADIENT` : used to register an Op that has no backward pass.
- `REGISTER_OP_CPU_KERNEL` : registers the `ops::MulKernel` class, specializing its template parameters to `paddle::platform::CPUPlace` and `float`; the `ops::MulGradKernel` class is registered in the same way.

- Register the GPU Kernel in the `.cu` file.
- Note that if the GPU Kernel implementation is based on the Eigen unsupported module, please add the macro definition `#define EIGEN_USE_GPU` at the beginning of the `.cu` file. A code example follows:

```cpp
// if use Eigen unsupported module before include head files
@ -225,7 +242,7 @@ REGISTER_OP_GPU_KERNEL(mul_grad,
- Python binding

In [`paddle/pybind/pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc), use `USE_OP` to tell the compiler which Op needs to be linked; for a detailed explanation see the [code comments](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81).

```
USE_OP(mul);
@ -242,20 +259,23 @@ REGISTER_OP_GPU_KERNEL(mul_grad,
USE_NO_KENREL_OP(recurrent);
```

- Building the library

There is no need to modify the [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt) file; new `*_op.cc` files under the `paddle/operators` directory are automatically linked into the generated library.

## Implementing unit tests

Unit tests include comparing the forward Op's implementations on different devices (CPU, GPU), comparing the backward Op's implementations on different devices (CPU, GPU), and the gradient test of the backward Op. Below we walk through the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).

### Unit testing the forward Operator

The forward Op unit test inherits from `unittest.TestCase` and defines the metaclass `__metaclass__ = OpTestMeta`; the concrete test steps are carried out in `OpTestMeta`. Testing a forward Operator requires:

1. Defining the inputs, outputs, and the relevant attribute parameters in the `setUp` function.
2. Generating random input data.
3. Implementing the same computation logic as the forward operator in the Python script to obtain the expected output, then comparing it with the output of the operator's forward computation.

```python
import unittest
@ -274,18 +294,19 @@ class TestMulOp(unittest.TestCase):
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
```

The code above first imports the required packages. The important variables handled in the `setUp` function are explained in detail below:

- `self.type = "mul"` : defines the type, which must be consistent with the type registered for the operator.
- `self.inputs` : defines the inputs, of type `numpy.array`, and initializes them.
- `self.outputs` : defines the outputs; the same computation logic as the operator is carried out in the Python script, and the Python-side result is returned.

### Unit testing the backward Operator

The backward Op unit test inherits from `GradientChecker`, which in turn inherits from `unittest.TestCase`, so **backward unit test functions must start with `test_`**.

```python
class TestMulGradOp(GradientChecker):
    def setUp(self):
        self.op = create_op("mul")
@ -319,27 +340,27 @@ class TestMulGradOp(GradientChecker):
no_grad_set={"Y"})
```

Some key points in the code are explained below:

- Calling `create_op("mul")` creates the forward Op corresponding to the backward Op.
- Calling the `compare_grad` function compares the CPU and GPU computation results.
- `test_normal` calls `check_grad` to check the correctness and stability of the gradients with the numerical method.
  - The first argument, `self.op` : the forward Op.
  - The second argument, `self.inputs` : the input dictionary; its keys must be consistent with the `ProtoMaker` definition.
  - The third argument, `["X", "Y"]` : specifies gradient checking for the input variables `X` and `Y`.
  - The fourth argument, `"Out"` : specifies `Out` as the final output target variable of the forward network.
- The `test_ignore_x` and `test_ignore_y` branches test the case where only one input's gradient needs to be computed.

### Building and running the unit tests

After the unit tests are written, add the following to [`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt) to add them to the project:

```
py_test(test_mul_op SRCS test_mul_op.py)
```

Please note that, **unlike building an Op, running the unit tests requires building the entire project**, and `WITH_TESTING` must be enabled at build time, i.e. `cmake paddle_dir -DWITH_TESTING=ON`. After a successful build, run the unit tests with:

```bash
make test ARGS="-R test_mul_op -V"

@ -18,14 +18,6 @@ limitations under the License. */
#ifndef __NVCC__ #ifndef __NVCC__
#include "paddle/math/MathFunctions.h"
#ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm<float>
#else
#define CBLAS_GEMM paddle::gemm<double>
#endif
template<class OpResetOutput> template<class OpResetOutput>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
real *gateValue, real *gateValue,
@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
} }
} }
template<class OpResetOutput, class OpFinalOutput>
void hl_cpu_gru_forward(OpResetOutput opResetOutput,
OpFinalOutput opFinalOutput,
hl_gru_value value,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
2 * frameSize,
frameSize,
1,
value.prevOutValue,
frameSize,
value.gateWeight,
frameSize * 2,
1,
value.gateValue,
frameSize * 3);
}
forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
frameSize,
frameSize,
1,
value.resetOutputValue,
frameSize,
value.stateWeight,
frameSize,
1,
value.gateValue + frameSize * 2,
frameSize * 3);
}
forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
}
template<class OpStateGrad> template<class OpStateGrad>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
real *gateValue, real *gateValue,
@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
} }
} }
template<class OpStateGrad, class OpResetGrad>
void hl_cpu_gru_backward(OpStateGrad opStateGrad,
OpResetGrad opResetGrad,
hl_gru_value value,
hl_gru_grad grad,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
backward_state_grad(opStateGrad, value, grad,
frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize,
1,
grad.gateGrad + frameSize * 2,
frameSize * 3,
value.stateWeight,
frameSize,
0,
grad.resetOutputGrad,
frameSize);
if (grad.stateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize,
batchSize,
1,
value.resetOutputValue,
frameSize,
grad.gateGrad + frameSize * 2,
frameSize * 3,
1,
grad.stateWeightGrad,
frameSize);
}
}
backward_reset_grad(opResetGrad, value, grad,
frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize * 2,
1,
grad.gateGrad,
frameSize * 3,
value.gateWeight,
frameSize * 2,
1,
grad.prevOutGrad,
frameSize);
if (grad.gateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize * 2,
batchSize,
1,
value.prevOutValue,
frameSize,
grad.gateGrad,
frameSize * 3,
1,
grad.gateWeightGrad,
frameSize * 2);
}
}
}
#endif #endif
#endif // HL_CPU_GRU_CUH_ #endif // HL_CPU_GRU_CUH_

@ -41,11 +41,23 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
// check whether a value(attribute) fit a certain limit // check whether a value(attribute) fit a certain limit
template <typename T> template <typename T>
class LargerThanChecker { class GreaterThanChecker {
public: public:
explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const { void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
}
private:
T lower_bound_;
};
template <typename T>
class EqualGreaterThanChecker {
public:
explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
} }
private: private:
@ -110,8 +122,13 @@ class TypedAttrChecker {
return *this; return *this;
} }
TypedAttrChecker& LargerThan(const T& lower_bound) { TypedAttrChecker& GreaterThan(const T& lower_bound) {
value_checkers_.push_back(LargerThanChecker<T>(lower_bound)); value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
return *this;
}
TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
value_checkers_.push_back(EqualGreaterThanChecker<T>(lower_bound));
return *this; return *this;
} }
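To make the new checkers concrete, here is a small self-contained sketch of the pattern: value checkers are callables appended by chained calls and later applied to the attribute value. The `ToyAttrChecker` class below is purely illustrative and is not PaddlePaddle's `TypedAttrChecker`.

```cpp
// Illustrative sketch of the checker pattern added above; not Paddle code.
#include <cassert>
#include <functional>
#include <vector>

template <typename T>
class ToyAttrChecker {
 public:
  ToyAttrChecker& GreaterThan(T lower) {
    checkers_.push_back([lower](T& v) { assert(v > lower); });
    return *this;
  }
  ToyAttrChecker& EqualGreaterThan(T lower) {
    checkers_.push_back([lower](T& v) { assert(v >= lower); });
    return *this;
  }
  void Check(T& value) const {
    for (auto& c : checkers_) c(value);  // apply every registered checker
  }

 private:
  std::vector<std::function<void(T&)>> checkers_;
};

int main() {
  ToyAttrChecker<float> scale_checker;
  scale_checker.GreaterThan(0.0f);  // like AddAttr("scale", ...).GreaterThan(0.0)
  float scale = 1.0f;
  scale_checker.Check(scale);       // passes: 1.0 > 0.0
}
```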

@ -2,20 +2,20 @@
## Motivation

In a neural network, the backpropagation algorithm follows the chain rule, so we need to compose the gradient operators/expressions together according to the chain rule. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass is generated with respect to its forward pass.

## Backward Operator Registry

A backward network is built up from several backward operators. Backward operators take the forward operators' inputs, outputs, and output gradients, and then calculate the input gradients.

|                        | forward operator | backward operator                |
| ---------------------- | ---------------- | -------------------------------- |
| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
| **Operator::outputs_** | Outputs          | InputGradients                   |

In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded by a global hash map (`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.

For example, given a `mul_op`, we can register its information and its corresponding backward operator with the following macro:

```cpp
REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
@ -27,7 +27,7 @@ REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
## Backward Operator Creation

Given a certain forward operator, we can get its corresponding backward operator by calling:

```cpp
OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
@ -37,7 +37,7 @@ The function `BuildGradOp` will sequentially execute following processes:
1. Get the `type_` of the given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to the map `inputs`, except those entries that are not necessary for gradient computation.
3. Add the forward inputs' gradient variables into the map `outputs`, and add the forward outputs' gradient variables into the map `inputs`.
@ -53,27 +53,27 @@ given a forward network, it generates the backward network. We only care about t
1. Op

   When the input forward network is an Op, return its gradient operator immediately.

2. NetOp

   When the input forward network is a NetOp, it needs to call the backward functions of the sub NetOp/Operators recursively. During the process, we need to collect the `OutputGradients` names according to the forward NetOp.

**Shared variable**. As illustrated in the pictures, two operators' output gradients will overwrite their shared input variable.

<p align="center">
<img src="./images/duplicate_op.png" width="50%" ><br/>

1. Shared variable in operators.

</p>

Sharing a variable between operators, or using the same input variable in multiple operators, leads to duplicate gradient variables. As the demo above shows, we need to rename the gradient names recursively and add a generic add operator to replace the overwriting links.

<p align="center">
<img src="images/duplicate_op2.png" width="50%" ><br/>

2. Replace the shared variable's gradient with an `Add` operator.

</p>

@ -283,5 +283,14 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
DDim::DDim(std::initializer_list<int64_t> init_list) { DDim::DDim(std::initializer_list<int64_t> init_list) {
*this = make_ddim(init_list); *this = make_ddim(init_list);
} }
DDim flatten_to_2d(const DDim& src, int num_col_dims) {
int rank = src.size();
return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
product(slice_ddim(src, num_col_dims, rank))});
}
DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -115,6 +115,12 @@ int arity(const DDim& ddim);
std::ostream& operator<<(std::ostream&, const DDim&); std::ostream& operator<<(std::ostream&, const DDim&);
// Reshape a tensor to a matrix. The matrix's first dimension(column length)
// will be the product of tensor's first `num_col_dims` dimensions.
DDim flatten_to_2d(const DDim& src, int num_col_dims);
DDim flatten_to_1d(const DDim& src);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -63,20 +63,35 @@ struct EigenTensor {
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {}; struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) {
int rank = tensor.dims_.size();
PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
"`num_col_dims` must be between (0, rank_of_tensor).");
return EigenMatrix::From(tensor,
flatten_to_2d(tensor.dims(), num_col_dims));
}
static typename EigenMatrix::ConstType Reshape(const Tensor& tensor,
int num_col_dims) {
int rank = tensor.dims_.size();
PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
"`num_col_dims` must be between (0, rank_of_tensor).");
return EigenMatrix::From(tensor,
flatten_to_2d(tensor.dims(), num_col_dims));
}
};
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> { struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten reshapes a Tensor into an EigenVector. // Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(Tensor& tensor) { static typename EigenVector::Type Flatten(Tensor& tensor) {
return EigenVector::From( return EigenVector::From(tensor, {product(tensor.dims_)});
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
} }
static typename EigenVector::ConstType Flatten(const Tensor& tensor) { static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
return EigenVector::From( return EigenVector::From(tensor, {product(tensor.dims_)});
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
} }
}; };

@ -108,5 +108,24 @@ TEST(Eigen, Matrix) {
} }
} }
TEST(Eigen, MatrixReshape) {
Tensor t;
float* p = t.mutable_data<float>({2, 3, 6, 4}, platform::CPUPlace());
for (int i = 0; i < 2 * 3 * 6 * 4; ++i) {
p[i] = static_cast<float>(i);
}
EigenMatrix<float>::Type em = EigenMatrix<float>::Reshape(t, 2);
ASSERT_EQ(2 * 3, em.dimension(0));
ASSERT_EQ(6 * 4, em.dimension(1));
for (int i = 0; i < 2 * 3; i++) {
for (int j = 0; j < 6 * 4; j++) {
ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@ -87,3 +87,24 @@ message OpProto {
repeated Attr attrs = 4; repeated Attr attrs = 4;
required string comment = 5; required string comment = 5;
} }
enum DataType {
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LoDTensorDesc {
required DataType data_type = 1;
repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [ default = 0 ];
}
message VarDesc {
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
}

@ -3,7 +3,7 @@
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
USE_OP(add_two); USE_OP(add);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
@ -41,7 +41,7 @@ namespace f = paddle::framework;
TEST(GradOpBuilder, AddTwo) { TEST(GradOpBuilder, AddTwo) {
std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp( std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
"add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {})); "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
std::shared_ptr<f::OperatorBase> grad_add_op = std::shared_ptr<f::OperatorBase> grad_add_op =
f::OpRegistry::CreateGradOp(*add_op); f::OpRegistry::CreateGradOp(*add_op);
EXPECT_EQ(grad_add_op->Inputs().size(), 4UL); EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);

