Merge branch 'develop' into cos_sim_vector

8 years ago · 965fd2250d
parent 03ea7320d3 0f42e5649e
commit 965fd2250d
184 changed files with 6758 additions and 2503 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -4,7 +4,6 @@ cache:
    - $HOME/.ccache
    - $HOME/.cache/pip
    - $TRAVIS_BUILD_DIR/build/third_party
    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
@ -12,7 +11,6 @@ os:
 env:
  - JOB=build_doc
  - JOB=check_style
  - JOB=build_android
 addons:
  apt:
    packages:
@ -23,7 +21,6 @@ addons:
      - python
      - python-pip
      - python2.7-dev
      - python-numpy
      - python-wheel
      - libboost-dev
      - curl
@ -37,8 +34,8 @@ before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
-  - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
+  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
+  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - go get -u github.com/alecthomas/gometalinter
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -65,8 +65,8 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()
 if(ANDROID)
-    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
-        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
+        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
    endif()
    set(WITH_GPU OFF CACHE STRING
--- a/Dockerfile.android
+++ b/Dockerfile.android
@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 # ENV variables
 ARG ANDROID_ABI
 ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
 ENV HOME=/root \
    ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
+    ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
    ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
 RUN apt-get update && \
    apt-get install -y \
@ -15,12 +21,11 @@ RUN apt-get update && \
    apt-get clean -y
 # Install Go and glide
-RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -C /usr/local -xzf go.tgz && \
+    tar -xz -C /usr/local && \
    mkdir /root/gopath && \
    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src && \
+    mkdir /root/gopath/src
    rm go.tgz
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
 ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
    unzip -q android-ndk-r14b-linux-x86_64.zip && \
    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
    rm -rf /opt/android-ndk-tmp && \
    rm -rf ${ANDROID_NDK_HOME}
--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@ -20,6 +20,7 @@
 # The supported variables are listed below:
 # 
 # ANDROID_STANDALONE_TOOLCHAIN
 # ANDROID_TOOLCHAIN
 # ANDROID_ABI
 # ANDROID_NATIVE_API_LEVEL
 # ANDROID_ARM_MODE
@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
    ENDIF()
 ENDIF()
 IF(NOT DEFINED ANDROID_TOOLCHAIN)
    SET(ANDROID_TOOLCHAIN clang)
 ENDIF()
 IF(NOT DEFINED ANDROID_ABI)
    SET(ANDROID_ABI "armeabi-v7a")
 ENDIF()
@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
            "${CMAKE_VERSION}), when cross-compiling for Android.")
    IF(ANDROID_STANDALONE_TOOLCHAIN)
        # Use standalone toolchain
        SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
        IF(NOT CMAKE_SYSTEM_VERSION)
@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ENDIF()
        # Toolchain
        SET(ANDROID_TOOLCHAIN "gcc")
        SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
-        IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+    ELSE(ANDROID_NDK)
-            SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
+        # TODO: use android ndk
-            IF(ANDROID_ABI STREQUAL "armeabi")
+    ENDIF()
-                SET(CMAKE_SYSTEM_PROCESSOR armv5te)
+
-            ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-                SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
+        SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
-            ENDIF()
+        IF(ANDROID_ABI STREQUAL "armeabi")
-        ENDIF()
+            SET(CMAKE_SYSTEM_PROCESSOR armv5te)
-        IF(ANDROID_ABI STREQUAL "arm64-v8a")
+            SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
-            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+        ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
-            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+            SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
            SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
        ENDIF()
-        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
        SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
        SET(CMAKE_SYSTEM_PROCESSOR aarch64)
        SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
    ELSE()
        MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
    ENDIF()
    SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
    IF(ANDROID_TOOLCHAIN STREQUAL clang)
        SET(ANDROID_C_COMPILER_NAME clang)
        SET(ANDROID_CXX_COMPILER_NAME clang++)
        SET(CMAKE_C_COMPILER_TARGET   ${ANDROID_CLANG_TRIPLE})
        SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
    ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
        SET(ANDROID_C_COMPILER_NAME gcc)
        SET(ANDROID_CXX_COMPILER_NAME g++)
    ELSE()
        MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
    ENDIF()
    # C compiler
    IF(NOT CMAKE_C_COMPILER)
-        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
+        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
    ELSE()
        GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
    ENDIF()
@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
    # CXX compiler
    IF(NOT CMAKE_CXX_COMPILER)
-        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
+        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
    ELSE()
        GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
    ENDIF()
@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
    SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
    # Toolchain and ABI specific flags.
-    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
+    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
    SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
    IF(ANDROID_ABI STREQUAL "armeabi")
@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
             -march=armv5te
             -mtune=xscale
             -msoft-float)
-    ENDIF()
+    ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
    IF(ANDROID_ABI STREQUAL "armeabi-v7a")
        LIST(APPEND ANDROID_COMPILER_FLAGS
             -march=armv7-a
             -mfloat-abi=softfp)
@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
        ENDIF()
        LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
    ENDIF()
    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ELSE()
            LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
        ENDIF()
        IF(ANDROID_TOOLCHAIN STREQUAL clang)
            # Disable integrated-as for better compatibility.
            LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
        ENDIF()
    ENDIF()
-    IF(ANDROID_ABI STREQUAL "arm64-v8a")
+    IF(ANDROID_TOOLCHAIN STREQUAL clang)
-        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+        # CMake automatically forwards all compiler flags to the linker,
        # and clang doesn't like having -Wa flags being used for linking.
        # To prevent CMake from doing this would require meddling with
        # the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
        LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
    ENDIF()
    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 IF(USE_EIGEN_FOR_BLAS)
    return()
 ENDIF(USE_EIGEN_FOR_BLAS)
 INCLUDE(cblas)
 IF(NOT ${CBLAS_FOUND})
--- a/doc/design/functions_operators_layers.md
+++ b/doc/design/functions_operators_layers.md
@ -86,12 +86,13 @@ def layer.fc(X):
 We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
-```
+
 | C++ functions/functors | mul          | add          |             |          |
 |------------------------|--------------|--------------|-------------|----------|
 | C++ operator class     | mulOp        | addOp        | FCOp        |          |
 | Python binding         | operator.mul | operator.add | operator.fc |          |
 | Python function        |              |              |             | layer.fc |
-```
+
 This is how we differentiate layer and operators in PaddlePaddle:
--- a/doc/design/graph.md
+++ b/doc/design/graph.md
@ -1,4 +1,4 @@
-# Design Doc: Computations as Graphs
+# Design Doc: Computations as a Graph
 A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
@ -8,6 +8,8 @@ This document explains that the construction of a graph as three steps:
 - construct the backward part
 - construct the optimization part
 ## The Construction of a Graph
 Let us take the problem of image classification as a simple example.  The application program that trains the model looks like:
 ```python
@ -25,7 +27,9 @@ The first four lines of above program build the forward part of the graph.
 ![](images/graph_construction_example_forward_only.png)
-In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b.
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
 Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so as to run at most once.  By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
 In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`.  These protobuf messages are saved in a `BlockDesc` protobuf message.
@ -49,3 +53,18 @@ According to the chain rule of gradient computation, `ConstructBackwardGraph` wo
 For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient.  Here results in the complete graph:
 ![](images/graph_construction_example_all.png)
 ## Block and Graph
 The words block and graph are interchangeable in the design of PaddlePaddle.  A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions.  A graph of operators and variables is a representation of the block.
 A Block keeps operators in an array `BlockDesc::ops`
 ```protobuf
 message BlockDesc {
  repeated OpDesc ops = 1;
  repeated VarDesc vars = 2;
 }
 ```
 in the order that they appear in user programs, like the Python program at the beginning of this article.  We can imagine that in `ops`,  we have some forward operators, followed by some gradient operators, and then some optimization operators.
--- a/doc/design/images/graph_construction_example.dot
+++ b/doc/design/images/graph_construction_example.dot
@ -2,6 +2,8 @@ digraph ImageClassificationGraph {
        ///////// The forward part /////////
        FeedX [label="Feed", color=blue, shape=box];
        FeedY [label="Feed", color=blue, shape=box];
        InitW [label="Init", color=blue, shape=diamond];
        Initb [label="Init", color=blue, shape=diamond];
        FC [label="FC", color=blue, shape=box];
        MSE [label="MSE", color=blue, shape=box];
@ -14,6 +16,8 @@ digraph ImageClassificationGraph {
        FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
        FeedY -> l [color=blue];
        InitW -> W [color=blue];
        Initb -> b [color=blue];
        W -> FC [color=blue];
        b -> FC [color=blue];
        l -> MSE [color=blue];
--- a/doc/design/images/graph_construction_example_all.png
+++ b/doc/design/images/graph_construction_example_all.png
--- a/doc/design/images/graph_construction_example_forward_backward.png
+++ b/doc/design/images/graph_construction_example_forward_backward.png
--- a/doc/design/images/graph_construction_example_forward_only.png
+++ b/doc/design/images/graph_construction_example_forward_only.png
--- a/doc/design/ops/dist_train.md
+++ b/doc/design/ops/dist_train.md
@ -0,0 +1,106 @@
 # Design Doc: Operation Graph Based Parameter Server
 ## Abstract
 We propose an approach to implement the parameter server. In this
 approach, there is no fundamental difference between the trainer and
 the parameter server: they both run subgraphs, but subgraphs of
 different purposes.
 ## Background
 The previous implementations of the parameter server do not run a
 subgraph. Parameter initialization, optimizer computation, network
 communication and checkpointing are implemented twice, on both the
 trainer and the parameter server.
 It would be great if we could write the code once and use it on both the
 trainer and the parameter server: this reduces code duplication and
 improves extensibility. Given that, after the current refactor, we are
 representing everything as a computing graph on the
 trainer, representing everything as a computing graph on the parameter
 server becomes a natural extension.
 ## Design
 ### Graph Converter
 The *graph converter* converts the user-defined operation (OP) graph
 into subgraphs to be scheduled on different nodes with the following
 steps:
 1. OP placement: the OPs will be placed on different nodes according
   to a heuristic that minimizes the estimated total computation
   time. Currently we will use a simple heuristic that puts parameter
   variables on parameter server workers and everything else on trainer
   workers.
 1. Add communication OPs to enable the communication between nodes.
 We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
 Below is an example of converting the user defined graph to the
 subgraphs for the trainer and the parameter server:
 <img src="src/local-graph.png" width="300"/>
 After converting:
 <img src="src/dist-graph.png" width="700"/>
 1. The parameter variable W and its optimizer subgraph are placed on the parameter server.
 1. Operators are added to the subgraphs.
   - *Send* sends data to the connected *Recv* operator.  The
 	 scheduler on the receive node will only schedule *Recv* operator
 	 to run when the *Send* operator has run (the *Send* OP will mark
 	 the *Recv* OP runnable automatically).
   - *Enqueue* enqueues the input variable; it can block until space
     becomes available in the queue.
   - *Dequeue* outputs configurable numbers of tensors from the
     queue. It will block until the queue has the required number of
     tensors.
 ### Benefits
 - Model parallelism becomes easier to implement: it's an extension to
  the trainer - parameter server approach. We already have the
  communication OPs, but need to extend the graph converter's
  placement functionality.
 - User-defined optimizer is easier to add - user can now express it as
  a subgraph.
 - No more duplication logic inside the trainer and the parameter
  server mentioned in the background section.
 ### Challenges
 - It might be hard for the graph converter to cut a general graph
  (without any hint for which subgraph is the optimizer). We may need
  to label which subgraph inside the OP graph is the optimizer.
 - It's important to balance the parameter shards on multiple
  parameter servers. If a single parameter is very big (some
  word-embedding, fully connected, softmax layer), we need to
  automatically partition the single parameter onto different
  parameter servers when possible (only element-wise optimizer depends
  on the parameter variable).
 ### Discussion
 - In the "Async SGD" figure, the "W" variable on the parameter server
  could be read and written concurrently; what is our locking strategy?
  E.g., should each variable have a lock cpp method to be invoked by every
  OP, or should we have a lock OP?
 - Can the Enqueue OP be implemented under our current tensor design
  (puts the input tensor into the queue tensor)?
 - *Dequeue* OP will have a variable number of outputs (depends on the
  `min_count` attribute); does our current design support it? (similar
  question for the *Add* OP)
 ### References:
 [1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
--- a/doc/design/ops/src/dist-graph.graffle
+++ b/doc/design/ops/src/dist-graph.graffle
--- a/doc/design/ops/src/dist-graph.png
+++ b/doc/design/ops/src/dist-graph.png
--- a/doc/design/ops/src/local-graph.graffle
+++ b/doc/design/ops/src/local-graph.graffle
--- a/doc/design/ops/src/local-graph.png
+++ b/doc/design/ops/src/local-graph.png
--- a/doc/design/simple_op_design.md
+++ b/doc/design/simple_op_design.md
@ -147,7 +147,7 @@ class CosineOp {
 struct CosineOpProtoMaker : public OpProtoMaker {
 	CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
 		AddInput("input", "input of cosine op");
-		AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0);
+		AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
 		AddType("cos");
 		AddComment("This is cos op");
 	}
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
@ -0,0 +1,124 @@
 ## Background
 PaddlePaddle divides the description of a neural network computation graph into two stages: compile time and runtime.
 PaddlePaddle uses proto messages to describe the compile-time graph because
 1. The computation graph should be able to be saved to a file.
 1. In distributed training, the graph will be serialized and sent to multiple workers.
 The computation graph is constructed from Data Nodes and Operation Nodes. The concepts that represent them are listed in the table below.
 | |compile time|runtime|
 |---|---|---|
 |Data|VarDesc(proto)|Variable(cpp)|
 |Operation|OpDesc(proto)|Operator(cpp)|
 ## Definition of VarDesc
 A VarDesc should have a name and a value; in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time, we add a LoDTensorDesc to represent it.
 ```proto
 message VarDesc {
  required string name = 1;
  optional LoDTensorDesc lod_tensor = 2;
 }
 ```
 ## Definition of LoDTensorDesc
 ```proto
 enum DataType {
  BOOL = 0;
  INT16 = 1;
  INT32 = 2;
  INT64 = 3;
  FP16 = 4;
  FP32 = 5;
  FP64 = 6;
 }
 message LoDTensorDesc {
  required DataType data_type = 1;
  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
  optional int32 lod_level = 3 [default=0];
 }
 ```
 ## Definition of Variable in Python
 In the Python API, a layer takes Variables as input and returns Variables as output. There should be a class `Variable` in Python to help create and manage Variables.
 ```python
 image = Variable(dims=[-1, 640, 480])
 # fc1 and fc2 are both Variable
 fc1 = layer.fc(input=image, output_size=10)
 fc2 = layer.fc(input=fc1, output_size=20)
 ```
 ### What should class `Variable` have
 1. `name`. A name of string type is used to mark the value of the Variable.
 1. `initializer`. Since our Tensor does not have a value, we will always use some Operator to fill it at run time. So we should have an initialize method to help add the init operator.
 1. `operator`. A Variable should record which operator produces it. The reason is:
  - we use pd.eval(targets=[var1, var2]) to run the related ops to get the values of var1 and var2. var.op is used to trace the dependencies of the current variable.
 In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
 ```python
 import VarDesc
 import LoDTensorDesc
 import framework
 def AddInitialOperator(variable, initializer):
 	# add an initialize Operator to block to init this Variable
 class Variable(object):
   def __init__(self, name, dims, type, initializer):
      self._block = get_default_block()
      self._name = name
      self.op = None
      tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
      _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
      self._var = framework.CreateVar(_var_desc)
      self._block.add_var(self)
      # add initial op according to initializer
      if initializer is not None:
          AddInitialOperator(self, initializer)
   def dims(self):
      return self._var.dims()
   def data_type(self):
       return self._var.data_type()
   def to_proto(self):
       pass
 ```
 Then we can use this Variable to create a fc layer in Python.
 ```python
 import paddle as pd
 def flatten_size(X, num_flatten_dims):
  prod = 1 # of last num_flatten_dims
  for i in xrange(num_flatten_dims):
    prod = prod * X.dims[-i-1]
  return prod
 def layer.fc(X, output_size, num_flatten_dims):
  W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
  b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
  out = Variable(type=FP32)
  y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
  pd.InferShape(y)
  return out
 x = Variable(dims=[-1, 640, 480])
 y = layer.fc(x, output_size=100)
 z = layer.fc(y, output_size=200)
 paddle.eval(targets=[z], ...)
 print(z)
 ```
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@ -5,15 +5,13 @@
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-如何构建PaddlePaddle的文档
+如何构建文档
-==========================
+============
-PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式，我们提供了一个构建脚本build_docs.sh来进行构建。
+PaddlePaddle的文档构建有两种方式。
 PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
-
+使用Docker构建
-使用Docker构建PaddlePaddle的文档
+--------------
 --------------------------------
 使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
@ -21,58 +19,46 @@ PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使
    cd TO_YOUR_PADDLE_CLONE_PATH
    cd paddle/scripts/tools/build_docs
-    bash build_docs.sh with_docker
+    sh build_docs.sh
 编译完成后，会在当前目录生成两个子目录\:
 * doc 英文文档目录
 * doc_cn 中文文档目录
 编译完成之后，会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
 打开浏览器访问对应目录下的index.html即可访问本地文档。
-
+直接构建
-
+--------
 直接构建PaddlePaddle的文档
 --------------------------
 因为PaddlePaddle的v2 api文档生成过程依赖于py_paddle Python包，用户需要首先确认py_paddle包已经安装。
 ..  code-block:: bash
    python -c "import py_paddle"
 如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
 注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
 如果提示正确，可以执行以下命令编译生成文档，即
 ..  code-block:: bash
    cd TO_YOUR_PADDLE_CLONE_PATH
-    cd paddle/scripts/tools/build_docs
+    mkdir -p build
-    bash build_docs.sh local
+    cd build
-
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
-编译完成之后，会在当前目录生成两个子目录\:
+    make gen_proto_py
-
+    make paddle_docs paddle_docs_cn
 * doc 英文文档目录
 * doc_cn 中文文档目录
 编译完成之后，会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
 打开浏览器访问对应目录下的index.html即可访问本地文档。
-如何书写PaddlePaddle的文档
+如何书写文档
-==========================
+============
 PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
-如何更新www.paddlepaddle.org文档
+如何更新文档主题
-================================
+================
 PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下，包含所有和前端网页设计相关的文件。
-开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+如何更新doc.paddlepaddle.org
 ============================
 更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
 目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
 `英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。
 ..  _cmake: https://cmake.org/
 ..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@ -18,14 +18,6 @@ limitations under the License. */
 #ifndef __NVCC__
 #include "paddle/math/MathFunctions.h"
 #ifndef PADDLE_TYPE_DOUBLE
 #define     CBLAS_GEMM     paddle::gemm<float>
 #else
 #define     CBLAS_GEMM     paddle::gemm<double>
 #endif
 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
                                       real *gateValue,
@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
  }
 }
 template<class OpResetOutput, class OpFinalOutput>
 void hl_cpu_gru_forward(OpResetOutput opResetOutput,
                        OpFinalOutput opFinalOutput,
                        hl_gru_value value,
                        int frameSize,
                        int batchSize,
                        hl_activation_mode_t active_node,
                        hl_activation_mode_t active_gate) {
  if (value.prevOutValue) {
    CBLAS_GEMM(CblasNoTrans,
               CblasNoTrans,
               batchSize,
               2 * frameSize,
               frameSize,
               1,
               value.prevOutValue,
               frameSize,
               value.gateWeight,
               frameSize * 2,
               1,
               value.gateValue,
               frameSize * 3);
  }
  forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
  if (value.prevOutValue) {
    CBLAS_GEMM(CblasNoTrans,
               CblasNoTrans,
               batchSize,
               frameSize,
               frameSize,
               1,
               value.resetOutputValue,
               frameSize,
               value.stateWeight,
               frameSize,
               1,
               value.gateValue + frameSize * 2,
               frameSize * 3);
  }
  forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
 }
 template<class OpStateGrad>
 void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
                                      real *gateValue,
@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
  }
 }
 template<class OpStateGrad, class OpResetGrad>
 void hl_cpu_gru_backward(OpStateGrad opStateGrad,
                         OpResetGrad opResetGrad,
                         hl_gru_value value,
                         hl_gru_grad  grad,
                         int frameSize,
                         int batchSize,
                         hl_activation_mode_t active_node,
                         hl_activation_mode_t active_gate) {
  backward_state_grad(opStateGrad, value, grad,
    frameSize, batchSize, active_node);
  if (value.prevOutValue && grad.prevOutGrad) {
    CBLAS_GEMM(CblasNoTrans,
               CblasTrans,
               batchSize,
               frameSize,
               frameSize,
               1,
               grad.gateGrad + frameSize * 2,
               frameSize * 3,
               value.stateWeight,
               frameSize,
               0,
               grad.resetOutputGrad,
               frameSize);
    if (grad.stateWeightGrad) {
      CBLAS_GEMM(CblasTrans,
                 CblasNoTrans,
                 frameSize,
                 frameSize,
                 batchSize,
                 1,
                 value.resetOutputValue,
                 frameSize,
                 grad.gateGrad + frameSize * 2,
                 frameSize * 3,
                 1,
                 grad.stateWeightGrad,
                 frameSize);
    }
  }
  backward_reset_grad(opResetGrad, value, grad,
    frameSize, batchSize, active_gate);
  if (grad.prevOutGrad && value.prevOutValue) {
    CBLAS_GEMM(CblasNoTrans,
               CblasTrans,
               batchSize,
               frameSize,
               frameSize * 2,
               1,
               grad.gateGrad,
               frameSize * 3,
               value.gateWeight,
               frameSize * 2,
               1,
               grad.prevOutGrad,
               frameSize);
    if (grad.gateWeightGrad) {
      CBLAS_GEMM(CblasTrans,
                 CblasNoTrans,
                 frameSize,
                 frameSize * 2,
                 batchSize,
                 1,
                 value.prevOutValue,
                 frameSize,
                 grad.gateGrad,
                 frameSize * 3,
                 1,
                 grad.gateWeightGrad,
                 frameSize * 2);
    }
  }
 }
 #endif
 #endif  // HL_CPU_GRU_CUH_
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -9,6 +9,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 cc_test(variable_test SRCS variable_test.cc)
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@ -41,11 +41,23 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
 // check whether a value (attribute) fits a certain limit
 template <typename T>
-class LargerThanChecker {
+class GreaterThanChecker {
 public:
-  explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
  void operator()(T& value) const {
-    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
+    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
  }
 private:
  T lower_bound_;
 };
 template <typename T>
 class EqualGreaterThanChecker {
 public:
  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
  void operator()(T& value) const {
    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
  }
 private:
@ -110,8 +122,13 @@ class TypedAttrChecker {
    return *this;
  }
-  TypedAttrChecker& LargerThan(const T& lower_bound) {
+  TypedAttrChecker& GreaterThan(const T& lower_bound) {
-    value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
+    value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
    return *this;
  }
  // Appends an EqualGreaterThanChecker<T> built from `lower_bound` to
  // value_checkers_ and returns *this so that calls can be chained.
  TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
    value_checkers_.push_back(EqualGreaterThanChecker<T>(lower_bound));
    return *this;
  }
--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
@ -2,20 +2,31 @@
 ## Motivation
-In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
+In neural networks, many models are currently solved by the backpropagation algorithm (known as BP). Technically, it calculates the gradient of the loss function and then distributes it back through the network. Because it follows the chain rule, we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass will be generated with respect to the forward pass.
 ## Backward Operator Registry
-A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
+## Implementation
 In this design doc, we exported only one API for generating the backward pass.
 ```c++
 std::unique_ptr<OperatorBase> Backward(const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);
 ```
 The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**.
 ### Backward Operator Registry
 A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients.
 |                        | forward operator | backward operator 
 | ---------------------- | ---------------- |------------------------- |		
 | **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
 | **Operator::outputs_** | Outputs          | InputGradients            |
- In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
+ In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
-For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro:
+For example, we have got a `mul_op`, and we can register its information and corresponding backward operator by the following macro:
 ```cpp
 REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
@ -25,58 +36,65 @@ REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
 `mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
-## Backward Opeartor Creating
+### Backward Operator Creating
-Given a certain forward operator, we can get its corresponding backward opeartor by calling:
+Given a certain forward operator, we can get its corresponding backward operator by calling:
 ```cpp
 OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
-``` 
+```
 The function `BuildGradOp` will sequentially execute following processes:
 1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
-2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these are not necessary for gradient computing.
+2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to map `inputs`, except those that are not necessary for gradient computing.
 3. Add forward inputs' gradient variables into map `outputs`, and add forward outputs' gradient variables into map `inputs`.
 4. Building backward operator with `inputs`, `outputs` and forward operator's attributes.
-## Backward Network Building
+### Backward Network Building
-A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and put them together.
+A backward network is a series of backward operators. The main idea of building a backward network is to create backward operators in the inverted sequence and append them together one by one. There are some corner cases that need special processing.
 In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network. 
 Given a forward network, it generates the backward network. We only care about the gradients: `OutputGradients` and `InputGradients`.
 1. Op 
-   when the input forward network is a Op, return its gradient Operator Immediately.
+   When the input forward network is an Op, return its gradient operator immediately. If all of its outputs are in the no-gradient set, then return a special `NOP`.
 2. NetOp 
-   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp.
+   In our design, the network itself is also a kind of operator(**NetOp**). So the operators contained by a big network may be some small network. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.
 3. RnnOp
   RnnOp is a nested stepnet operator. The backward module needs to recursively call `Backward` for every stepnet.
 4. Sharing Variables
   **sharing variables**. As illustrated in the pictures, two operators share the same variable name W@GRAD, which will overwrite their shared input variable.
 <p align="center">
 <img src="./images/duplicate_op.png" width="50%" ><br/>
-   **shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable.  
+	pic 1. Sharing variables in operators. 
-   <p align="center">
+</p>
   <img src="./images/duplicate_op.png" width="70%" ><br/>
-   1. shared variable in two operators. 
+	Sharing a variable between operators, or using the same input variable in multiple operators, leads to a duplicate gradient variable. As the demo above shows, we need to rename the gradient names recursively and add a generic add operator to replace the overwriting links.
-   </p>
+<p align="center">
 <img src="images/duplicate_op2.png" width="40%" ><br/>
-   Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links. 
+	pic 2. Replace sharing variable's gradient with `Add` operator.
-   <p align="center">
+</p>
   <img src="images/duplicate_op2.png" width="90%" ><br/>
-   2. replace shared variable gradient with `Add` Operator
+	Because our framework finds variables according to their names, we need to rename the output links. We add a numeric suffix to represent each link's position in clockwise order.
-   </p>
+5. Part of Gradient is Zero.
   In the whole graph, there are cases where one operator's gradient is not needed, but its input's gradient is a dependency link of another operator; we then need to fill a gradient matrix of the same shape in that position. In our implementation, we insert a special `fillZeroLike` operator.
-	Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
+Following the rules above, collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@ -283,5 +283,14 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
 // Constructs a DDim from a braced list of int64_t values by assigning the
 // result of make_ddim(init_list) to *this.
 DDim::DDim(std::initializer_list<int64_t> init_list) {
  *this = make_ddim(init_list);
 }
 // Builds a two-element DDim from `src`: the first element is the product of
 // slice_ddim(src, 0, num_col_dims) and the second is the product of
 // slice_ddim(src, num_col_dims, rank), where rank = src.size().
 // NOTE(review): presumably slice_ddim takes a half-open [begin, end) range, so
 // each dimension of `src` contributes to exactly one factor -- confirm.
 DDim flatten_to_2d(const DDim& src, int num_col_dims) {
  int rank = src.size();
  return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
                    product(slice_ddim(src, num_col_dims, rank))});
 }
 // Builds a one-element DDim whose single entry is product(src).
 DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
 }  // namespace framework
 }  // namespace paddle
--- a/Show More
+++ b/Show More