Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into op_transpose

8 years ago · 828008e41d
parent d6651b9b8e 544458e011
commit 828008e41d
113 changed files with 4354 additions and 1225 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -4,7 +4,6 @@ cache:
    - $HOME/.ccache
    - $HOME/.cache/pip
    - $TRAVIS_BUILD_DIR/build/third_party
-    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
@ -12,7 +11,6 @@ os:
 env:
  - JOB=build_doc
  - JOB=check_style
-  - JOB=build_android
 addons:
  apt:
    packages:
@ -23,7 +21,6 @@ addons:
      - python
      - python-pip
      - python2.7-dev
-      - python-numpy
      - python-wheel
      - libboost-dev
      - curl
@ -37,8 +34,8 @@ before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
-  - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
+  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
+  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - go get -u github.com/alecthomas/gometalinter
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -65,8 +65,8 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()

 if(ANDROID)
-    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
+    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
+        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
    endif()

    set(WITH_GPU OFF CACHE STRING
--- a/Dockerfile.android
+++ b/Dockerfile.android
@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'

+# ENV variables
+ARG ANDROID_ABI
+
+ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
+
 ENV HOME=/root \
    ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
+    ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
+    ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain

 RUN apt-get update && \
    apt-get install -y \
@ -15,12 +21,11 @@ RUN apt-get update && \
    apt-get clean -y

 # Install Go and glide
-RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
-    tar -C /usr/local -xzf go.tgz && \
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
    mkdir /root/gopath && \
    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src && \
-    rm go.tgz
+    mkdir /root/gopath/src
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
 ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
    unzip -q android-ndk-r14b-linux-x86_64.zip && \
    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
    rm -rf /opt/android-ndk-tmp && \
    rm -rf ${ANDROID_NDK_HOME}

--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@ -20,6 +20,7 @@
 # The supported variables are listed belows:
 # 
 # ANDROID_STANDALONE_TOOLCHAIN
+# ANDROID_TOOLCHAIN
 # ANDROID_ABI
 # ANDROID_NATIVE_API_LEVEL
 # ANDROID_ARM_MODE
@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
    ENDIF()
 ENDIF()

+IF(NOT DEFINED ANDROID_TOOLCHAIN)
+    SET(ANDROID_TOOLCHAIN clang)
+ENDIF()
+
 IF(NOT DEFINED ANDROID_ABI)
    SET(ANDROID_ABI "armeabi-v7a")
 ENDIF()
@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
            "${CMAKE_VERSION}), when cross-compiling for Android.")

    IF(ANDROID_STANDALONE_TOOLCHAIN)
+        # Use standalone toolchain
        SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")

        IF(NOT CMAKE_SYSTEM_VERSION)
@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ENDIF()

        # Toolchain
-        SET(ANDROID_TOOLCHAIN "gcc")
        SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
-        IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-            SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
-            IF(ANDROID_ABI STREQUAL "armeabi")
-                SET(CMAKE_SYSTEM_PROCESSOR armv5te)
-            ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
-                SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
-            ENDIF()
-        ENDIF()
-        IF(ANDROID_ABI STREQUAL "arm64-v8a")
-            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
-            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+    ELSE(ANDROID_NDK)
+        # TODO: use android ndk
+    ENDIF()
+
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
+        IF(ANDROID_ABI STREQUAL "armeabi")
+            SET(CMAKE_SYSTEM_PROCESSOR armv5te)
+            SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
+        ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
+            SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
        ENDIF()
-        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+        SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+        SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+        SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
+    ENDIF()
+    SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+
+    IF(ANDROID_TOOLCHAIN STREQUAL clang)
+        SET(ANDROID_C_COMPILER_NAME clang)
+        SET(ANDROID_CXX_COMPILER_NAME clang++)
+        SET(CMAKE_C_COMPILER_TARGET   ${ANDROID_CLANG_TRIPLE})
+        SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
+    ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
+        SET(ANDROID_C_COMPILER_NAME gcc)
+        SET(ANDROID_CXX_COMPILER_NAME g++)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
    ENDIF()

    # C compiler
    IF(NOT CMAKE_C_COMPILER)
-        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
+        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
    ELSE()
        GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
    ENDIF()
@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")

    # CXX compiler
    IF(NOT CMAKE_CXX_COMPILER)
-        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
+        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
    ELSE()
        GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
    ENDIF()
@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
    SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)

    # Toolchain and ABI specific flags.
-    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
+    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
    SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")

    IF(ANDROID_ABI STREQUAL "armeabi")
@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
             -march=armv5te
             -mtune=xscale
             -msoft-float)
-    ENDIF()
-    IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+    ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
        LIST(APPEND ANDROID_COMPILER_FLAGS
             -march=armv7-a
             -mfloat-abi=softfp)
@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
        ENDIF()
        LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
+    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
    ENDIF()

    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ELSE()
            LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
        ENDIF()
+        IF(ANDROID_TOOLCHAIN STREQUAL clang)
+            # Disable integrated-as for better compatibility.
+            LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
+        ENDIF()
    ENDIF()

-    IF(ANDROID_ABI STREQUAL "arm64-v8a")
-        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+    IF(ANDROID_TOOLCHAIN STREQUAL clang)
+        # CMake automatically forwards all compiler flags to the linker,
+        # and clang doesn't like having -Wa flags being used for linking.
+        # To prevent CMake from doing this would require meddling with
+        # the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
+        LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
    ENDIF()

    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+IF(USE_EIGEN_FOR_BLAS)
+    return()
+ENDIF(USE_EIGEN_FOR_BLAS)
+
 INCLUDE(cblas)

 IF(NOT ${CBLAS_FOUND})
--- a/doc/design/functions_operators_layers.md
+++ b/doc/design/functions_operators_layers.md
@ -86,12 +86,13 @@ def layer.fc(X):

 We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:

-```
+
 | C++ functions/functors | mul          | add          |             |          |
+|------------------------|--------------|--------------|-------------|----------|
 | C++ operator class     | mulOp        | addOp        | FCOp        |          |
 | Python binding         | operator.mul | operator.add | operator.fc |          |
 | Python function        |              |              |             | layer.fc |
-```
+

 This is how we differentiate layer and operators in PaddlePaddle:

--- a/doc/design/graph.md
+++ b/doc/design/graph.md
@ -1,4 +1,4 @@
-# Design Doc: Computations as Graphs
+# Design Doc: Computations as a Graph

 A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.

@ -8,6 +8,8 @@ This document explains that the construction of a graph as three steps:
 - construct the backward part
 - construct the optimization part

+## The Construction of a Graph
+
 Let us take the problem of image classification as a simple example.  The application program that trains the model looks like:

 ```python
@ -25,7 +27,9 @@ The first four lines of above program build the forward part of the graph.

 ![](images/graph_construction_example_forward_only.png)

-In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b.
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
+
+Initialization operators are a kind of "run-once" operator -- the `Run` method increments a class data member counter so that the operator runs at most once.  By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.

 In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`.  These protobuf messages are saved in a `BlockDesc` protobuf message.

@ -49,3 +53,18 @@ According to the chain rule of gradient computation, `ConstructBackwardGraph` wo
 For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient.  Here results in the complete graph:

 ![](images/graph_construction_example_all.png)
+
+## Block and Graph
+
+The words block and graph are interchangeable in the design of PaddlePaddle.  A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions.  A graph of operators and variables is a representation of the block.
+
+A Block keeps operators in an array `BlockDesc::ops`
+
+```protobuf
+message BlockDesc {
+  repeated OpDesc ops = 1;
+  repeated VarDesc vars = 2;
+}
+```
+
+in the order that they appear in user programs, like the Python program at the beginning of this article.  We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
--- a/doc/design/images/graph_construction_example.dot
+++ b/doc/design/images/graph_construction_example.dot
@ -2,6 +2,8 @@ digraph ImageClassificationGraph {
        ///////// The forward part /////////
        FeedX [label="Feed", color=blue, shape=box];
        FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
        FC [label="FC", color=blue, shape=box];
        MSE [label="MSE", color=blue, shape=box];

@ -14,6 +16,8 @@ digraph ImageClassificationGraph {

        FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
        FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
        W -> FC [color=blue];
        b -> FC [color=blue];
        l -> MSE [color=blue];
--- a/doc/design/images/graph_construction_example_all.png
+++ b/doc/design/images/graph_construction_example_all.png
--- a/doc/design/images/graph_construction_example_forward_backward.png
+++ b/doc/design/images/graph_construction_example_forward_backward.png
--- a/doc/design/images/graph_construction_example_forward_only.png
+++ b/doc/design/images/graph_construction_example_forward_only.png
--- a/doc/design/simple_op_design.md
+++ b/doc/design/simple_op_design.md
@ -147,7 +147,7 @@ class CosineOp {
 struct CosineOpProtoMaker : public OpProtoMaker {
 	CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
 		AddInput("input", "input of cosine op");
-		AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0);
+		AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
 		AddType("cos");
 		AddComment("This is cos op");
 	}
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
@ -0,0 +1,124 @@
+## Background
+PaddlePaddle divides the description of a neural network computation graph into two stages: compile time and runtime.
+
+PaddlePaddle uses proto messages to describe the compile-time graph because:
+
+1. Computation graph should be able to be saved to a file.
+1. In distributed training, the graph will be serialized and sent to multiple workers.
+
+The computation graph is constructed from Data Nodes and Operation Nodes. The concepts used to represent them are listed in the table below.
+
+| |compile time|runtime|
+|---|---|---|
+|Data|VarDesc(proto)|Variable(cpp)|
+|Operation|OpDesc(proto)|Operator(cpp)|
+
+
+## Definition of VarDesc
+
+A VarDesc should have a name and a value. In PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time, we add a LoDTensorDesc to represent it.
+
+```proto
+message VarDesc {
+  required string name = 1;
+  optional LoDTensorDesc lod_tensor = 2;
+}
+```
+
+## Definition of LoDTensorDesc
+
+```proto
+enum DataType {
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+}
+
+message LoDTensorDesc {
+  required DataType data_type = 1;
+  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  optional int32 lod_level = 3 [default=0];
+}
+```
+
+## Definition of Variable in Python
+
+In the Python API, a layer takes Variables as input and returns Variables as output. There should be a class `Variable` in Python to help create and manage Variables.
+
+```python
+image = Variable(dims=[-1, 640, 480])
+# fc1 and fc2 are both Variable
+fc1 = layer.fc(input=image, output_size=10)
+fc2 = layer.fc(input=fc1, output_size=20)
+```
+### What should class `Variable` have
+1. `name`. A name of string type is used to mark the value of the Variable.
+1. `initializer`. Since our Tensor does not have a value, we will always use some Operator to fill it at run time. So we should have an initialize method to help add the init operator.
+1. `operator`. A Variable should record which operator produces it. The reason is:
+  - we use pd.eval(targets=[var1, var2]) to run the related ops to get the values of var1 and var2. var.op is used to trace the dependencies of the current variable.
+
+In PaddlePaddle, we use Block to describe the computation graph, so the code uses Block rather than Graph.
+
+```python
+import VarDesc
+import LoDTensorDesc
+import framework
+
+def AddInitialOperator(variable, initializer):
+	# add an initialize Operator to block to init this Variable
+
+class Variable(object):
+   def __init__(self, name, dims, type, initializer):
+      self._block = get_default_block()
+      self._name = name
+      self.op = None
+
+      tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
+      _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
+      self._var = framework.CreateVar(_var_desc)
+      self._block.add_var(self)
+
+      # add initial op according to initializer
+      if initializer is not None:
+          AddInitialOperator(self, initializer)
+
+   def dims(self):
+      return self._var.dims()
+
+   def data_type(self):
+       return self._var.data_type()
+
+   def to_proto(self):
+       pass
+```
+
+Then we can use this Variable to create a fc layer in Python.
+
+```python
+import paddle as pd
+
+def flatten_size(X, num_flatten_dims):
+  prod = 1 # of last num_flatten_dims
+  for i in xrange(num_flatten_dims):
+    prod = prod * X.dims[-i-1]
+  return prod
+
+def layer.fc(X, output_size, num_flatten_dims):
+  W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
+  b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
+  out = Variable(type=FP32)
+  y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
+  pd.InferShape(y)
+  return out
+
+x = Variable(dims=[-1, 640, 480])
+y = layer.fc(x, output_size=100)
+z = layer.fc(y, output_size=200)
+
+paddle.eval(targets=[z], ...)
+print(z)
+```
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@ -18,14 +18,6 @@ limitations under the License. */

 #ifndef __NVCC__

-#include "paddle/math/MathFunctions.h"
-
-#ifndef PADDLE_TYPE_DOUBLE
-#define     CBLAS_GEMM     paddle::gemm<float>
-#else
-#define     CBLAS_GEMM     paddle::gemm<double>
-#endif
-
 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
                                       real *gateValue,
@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
  }
 }

-template<class OpResetOutput, class OpFinalOutput>
-void hl_cpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {
-  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               2 * frameSize,
-               frameSize,
-               1,
-               value.prevOutValue,
-               frameSize,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               value.gateValue,
-               frameSize * 3);
-  }
-
-  forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
-
-  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               value.resetOutputValue,
-               frameSize,
-               value.stateWeight,
-               frameSize,
-               1,
-               value.gateValue + frameSize * 2,
-               frameSize * 3);
-  }
-
-  forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
-}
-
 template<class OpStateGrad>
 void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
                                      real *gateValue,
@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
  }
 }

-template<class OpStateGrad, class OpResetGrad>
-void hl_cpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {
-  backward_state_grad(opStateGrad, value, grad,
-    frameSize, batchSize, active_node);
-
-  if (value.prevOutValue && grad.prevOutGrad) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               grad.gateGrad + frameSize * 2,
-               frameSize * 3,
-               value.stateWeight,
-               frameSize,
-               0,
-               grad.resetOutputGrad,
-               frameSize);
-
-    if (grad.stateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize,
-                 batchSize,
-                 1,
-                 value.resetOutputValue,
-                 frameSize,
-                 grad.gateGrad + frameSize * 2,
-                 frameSize * 3,
-                 1,
-                 grad.stateWeightGrad,
-                 frameSize);
-    }
-  }
-
-  backward_reset_grad(opResetGrad, value, grad,
-    frameSize, batchSize, active_gate);
-
-  if (grad.prevOutGrad && value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize * 2,
-               1,
-               grad.gateGrad,
-               frameSize * 3,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               grad.prevOutGrad,
-               frameSize);
-
-    if (grad.gateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize * 2,
-                 batchSize,
-                 1,
-                 value.prevOutValue,
-                 frameSize,
-                 grad.gateGrad,
-                 frameSize * 3,
-                 1,
-                 grad.gateWeightGrad,
-                 frameSize * 2);
-    }
-  }
-}
-
 #endif

 #endif  // HL_CPU_GRU_CUH_
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@ -41,11 +41,23 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc);

 // check whether a value(attribute) fit a certain limit
 template <typename T>
-class LargerThanChecker {
+class GreaterThanChecker {
 public:
-  explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
  void operator()(T& value) const {
-    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
+    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
+  }
+
+ private:
+  T lower_bound_;
+};
+
+template <typename T>
+class EqualGreaterThanChecker {
+ public:
+  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
  }

 private:
@ -110,8 +122,13 @@ class TypedAttrChecker {
    return *this;
  }

-  TypedAttrChecker& LargerThan(const T& lower_bound) {
-    value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
+  TypedAttrChecker& GreaterThan(const T& lower_bound) {
+    value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
+    return *this;
+  }
+
+  TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
+    value_checkers_.push_back(EqualGreaterThanChecker<T>(lower_bound));
    return *this;
  }

--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
@ -2,20 +2,20 @@

 ## Motivation

-In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
-  
+In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass will be generated with respect to the forward pass.
+
 ## Backward Operator Registry

-A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
+A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients, and then calculate the input gradients.

 |                        | forward operator | backward operator 
 | ---------------------- | ---------------- |------------------------- |		
 | **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
 | **Operator::outputs_** | Outputs          | InputGradients            |

- In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
+ In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.

-For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro:
+For example, we have got a `mul_op`, and we can register its information and corresponding backward operator by the following macro:

 ```cpp
 REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
@ -27,17 +27,17 @@ REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);

 ## Backward Opeartor Creating

-Given a certain forward operator, we can get its corresponding backward opeartor by calling:
+Given a certain forward operator, we can get its corresponding backward operator by calling:

 ```cpp
 OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
-``` 
+```

 The function `BuildGradOp` will sequentially execute following processes:

 1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.

-2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these are not necessary for gradient computing.
+2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to map `inputs`, except those that are not necessary for gradient computing.

 3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.

@ -49,31 +49,31 @@ A backward network is a series of backward operators. The main idea of building

 In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network. 

-given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
+given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`, `InputGradients`.

 1. Op 

-   when the input forward network is a Op, return its gradient Operator Immediately.
+   when the input forward network is an Op, return its gradient Operator Immediately.

 2. NetOp 

-   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp.
+   when the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.

-   **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable.  
+   **shared variable**. As illustrated in the pictures, two operators' `Output` `Gradient` will overwrite their shared input variable.  

   <p align="center">
-   <img src="./images/duplicate_op.png" width="70%" ><br/>
+   <img src="./images/duplicate_op.png" width="50%" ><br/>

-   1. shared variable in two operators. 
+   1. Shared variable in operators. 

   </p>

-   Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links. 
+   Sharing a variable between operators, or using the same input variable in multiple operators, leads to a duplicate gradient variable. As the demo above shows, we need to rename the gradient names recursively and add a generic add operator to replace the overwrite links. 

   <p align="center">
-   <img src="images/duplicate_op2.png" width="90%" ><br/>
+   <img src="images/duplicate_op2.png" width="50%" ><br/>

-   2. replace shared variable gradient with `Add` Operator
+   2. Replace shared variable's gradient with `Add` operator.

   </p>

--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@ -283,5 +283,14 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
 DDim::DDim(std::initializer_list<int64_t> init_list) {
  *this = make_ddim(init_list);
 }
+
+DDim flatten_to_2d(const DDim& src, int num_col_dims) {
+  int rank = src.size();
+  return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
+                    product(slice_ddim(src, num_col_dims, rank))});
+}
+
+DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@ -115,6 +115,12 @@ int arity(const DDim& ddim);

 std::ostream& operator<<(std::ostream&, const DDim&);

+// Reshape a tensor to a matrix. The matrix's first dimension (column length)
+// will be the product of the tensor's first `num_col_dims` dimensions.
+DDim flatten_to_2d(const DDim& src, int num_col_dims);
+
+DDim flatten_to_1d(const DDim& src);
+
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@ -63,20 +63,35 @@ struct EigenTensor {

 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
-struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
+  static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) {
+    int rank = tensor.dims_.size();
+    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
+                   "`num_col_dims` must be between (0, rank_of_tensor).");
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+
+  static typename EigenMatrix::ConstType Reshape(const Tensor& tensor,
+                                                 int num_col_dims) {
+    int rank = tensor.dims_.size();
+    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
+                   "`num_col_dims` must be between (0, rank_of_tensor).");
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+};

 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
  // Flatten reshapes a Tensor into an EigenVector.
  static typename EigenVector::Type Flatten(Tensor& tensor) {
-    return EigenVector::From(
-        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
+    return EigenVector::From(tensor, {product(tensor.dims_)});
  }

  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
-    return EigenVector::From(
-        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
+    return EigenVector::From(tensor, {product(tensor.dims_)});
  }
 };

--- a/paddle/framework/eigen_test.cc
+++ b/paddle/framework/eigen_test.cc
@ -108,5 +108,24 @@ TEST(Eigen, Matrix) {
  }
 }

+TEST(Eigen, MatrixReshape) {
+  Tensor t;
+  float* p = t.mutable_data<float>({2, 3, 6, 4}, platform::CPUPlace());
+  for (int i = 0; i < 2 * 3 * 6 * 4; ++i) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenMatrix<float>::Type em = EigenMatrix<float>::Reshape(t, 2);
+
+  ASSERT_EQ(2 * 3, em.dimension(0));
+  ASSERT_EQ(6 * 4, em.dimension(1));
+
+  for (int i = 0; i < 2 * 3; i++) {
+    for (int j = 0; j < 6 * 4; j++) {
+      ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@ -87,3 +87,24 @@ message OpProto {
  repeated Attr attrs = 4;
  required string comment = 5;
 }
+
+enum DataType {
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+}
+
+message LoDTensorDesc {
+  required DataType data_type = 1;
+  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  optional int32 lod_level = 3 [ default = 0 ];
+}
+
+message VarDesc {
+  required string name = 1;
+  optional LoDTensorDesc lod_tensor = 2;
+}
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@ -3,7 +3,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"

-USE_OP(add_two);
+USE_OP(add);

 namespace paddle {
 namespace framework {
@ -41,7 +41,7 @@ namespace f = paddle::framework;

 TEST(GradOpBuilder, AddTwo) {
  std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
+      "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
  std::shared_ptr<f::OperatorBase> grad_add_op =
      f::OpRegistry::CreateGradOp(*add_op);
  EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
--- a/paddle/framework/images/duplicate_op2.graffle
+++ b/paddle/framework/images/duplicate_op2.graffle
--- a/paddle/framework/images/duplicate_op2.png
+++ b/paddle/framework/images/duplicate_op2.png
--- a/Show More
+++ b/Show More