## Optimizer Design

### The Problem

A PaddlePaddle program, or a block, is a sequence of operators operating on variables. A training program needs to do three kinds of work:

1. the forward pass, which computes intermediate results and the cost(s),
1. the backward pass, which derives gradients from intermediate results and costs, and
1. the optimization pass, which updates model parameters to optimize the cost(s).

These passes rely on three kinds of operators:

1. forward operators,
1. gradient operators, and
1. optimization operators.

It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could describe only the forward pass and let PaddlePaddle create the backward and optimization operators automatically.

In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.

### High-level Python API to describe the training process

1. Users write code to describe the network:

	```python
	images = layer.data("images")
	labels = layer.data("labels")
	w1 = pd.var("w1")
	b1 = pd.var("b1")
	hidden = layer.fc(images, w=w1, b=b1)
	cost = layer.mse(hidden, labels)
	```

	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).

2. Users create a certain kind of Optimizer with some arguments.

	```python
	optimizer = AdagradOptimizer(learning_rate=0.001)
	```

3. Users use the optimizer to `minimize` a certain `cost` by updating parameters in `parameter_list`.

	```python
	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
	```

	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.

4. Users use Session/Executor to run this `opt_op_list` as the target to do training. (A combined sketch of all four steps follows this list.)

	```python
	sess.run(target=opt_op_list, ...)
	```
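
Putting the four steps together, a minimal training sketch might look like the following. It merely recombines the snippets above; the names `layer`, `pd`, `AdagradOptimizer`, `sess`, and the number of passes are assumed from the earlier examples rather than a definitive API.

```python
# A minimal sketch that recombines the snippets from steps 1-4 above.
# The objects used here (layer, pd, AdagradOptimizer, sess) are assumed
# from the earlier examples, not a definitive API.

# Step 1: describe the forward pass.
images = layer.data("images")
labels = layer.data("labels")
w1 = pd.var("w1")
b1 = pd.var("b1")
hidden = layer.fc(images, w=w1, b=b1)
cost = layer.mse(hidden, labels)

# Step 2: create an optimizer.
optimizer = AdagradOptimizer(learning_rate=0.001)

# Step 3: derive gradient and optimization operators from the forward pass.
opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])

# Step 4: run the optimization operators as the training target.
# Feeding of input data is omitted for brevity.
num_passes = 10  # hypothetical number of training passes
for _ in range(num_passes):
    sess.run(target=opt_op_list)
```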

#### Optimizer Python interface:

```python
class Optimizer(object):
    """Optimizer Base class.

    """

    def __init__(self):
        pass

    def create_backward_pass(self, loss, parameter_list=None):
        """
        Create and add gradient operators in BlockDesc to compute the
        gradients of `loss` for the parameters in `parameter_list`.

        Args:
          loss: a variable generated by the cost function.
          parameter_list: parameters whose gradients need to be computed and
            that will be updated to optimize the loss.

        Returns:
          list of (parameters, gradients) pairs.
        """
        return None

    def create_optimization_pass(self, parameters_and_grads):
        """Add optimization operators to update gradients to variables.

        Args:
          parameters_and_grads: a list of (variable, gradient) pairs to update.

        Returns:
          optimization_op_list: a list of optimization operators that will
            update the parameters using the gradients.
        """
        return None

    def minimize(self, loss, parameter_list):
        """Add operations to minimize `loss` by updating `parameter_list`.

        This method combines the interfaces `create_backward_pass()` and
        `create_optimization_pass()` into one.
        """
        params_grads = self.create_backward_pass(loss, parameter_list)
        update_ops = self.create_optimization_pass(params_grads)
        return update_ops

```

Users can inherit from the `Optimizer` base class above to create their own optimizers with special logic, such as `AdagradOptimizer`.
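
For illustration, such a subclass might look roughly like the sketch below. It only shows how the interface above could be specialized; the operator type `"adagrad"`, the helper `append_op`, and the attribute names are assumptions made for this example, not the actual API.

```python
class AdagradOptimizer(Optimizer):
    """An illustrative Adagrad optimizer built on the base class above.

    NOTE: the operator type "adagrad" and the helper `append_op` are
    hypothetical names used only to sketch where a subclass would add
    its own logic; they are not the actual PaddlePaddle API.
    """

    def __init__(self, learning_rate=0.001, epsilon=1.0e-6):
        super(AdagradOptimizer, self).__init__()
        self.learning_rate = learning_rate
        self.epsilon = epsilon

    def create_optimization_pass(self, parameters_and_grads):
        optimization_op_list = []
        for param, grad in parameters_and_grads:
            # One optimization operator per parameter: it reads the parameter
            # and its gradient and writes the updated parameter back.
            op = append_op(
                type="adagrad",
                inputs={"Param": param, "Grad": grad},
                outputs={"ParamOut": param},
                attrs={"learning_rate": self.learning_rate,
                       "epsilon": self.epsilon})
            optimization_op_list.append(op)
        return optimization_op_list
```
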
# Design Doc: Selected Rows

`SelectedRows` is a kind of sparse tensor data type, which is designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor: only a few of its rows are non-zero. It is straightforward to represent such a sparse tensor by the following data structure:

```cpp
class SelectedRows {
 private:
  vector<int> rows_;  // indices of the non-zero rows
  Tensor value_;      // values of the non-zero rows
  int height_;        // first dimension of the full (dense) tensor
};
```

The field `height_` is the first dimension of the `SelectedRows`. `rows_` holds the indices of the rows that are non-zero. The `value_` field is an N-dim tensor with shape `[rows_.size() /* NUM_ROWS */, ...]`, which supplies the values of each non-zero row. The full dimension of the `SelectedRows` is `[height_] + value_.shape[1:]`.

Suppose that a `SelectedRows`-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`. The `SelectedRows` representation would be:

```
x = SelectedRows {
  rows = [73, 84],
  value = [[1, 2], [3, 4]]
}
```

## SelectedRows in Protobuf

`SelectedRows` is a kind of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described at compile time, since the `rows_` and `value_` depend on the training data. So we use `TensorDesc` to unify `data_type` and `dims`. A `LodTensorDesc` contains a `TensorDesc` and a `lod_level`. The description of a `SelectedRows` is simply a tensor description.

```proto
message TensorDesc {
  required DataType data_type = 1;
  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
}

message LodTensorDesc {
  required TensorDesc tensor = 1;
  optional int32 lod_level = 2;
}

message VarDesc {
  required string name = 1;
  enum VarType {
    LOD_TENSOR = 0;
    SELECTED_ROWS = 1;
  }
  required VarType type = 2;
  optional LodTensorDesc lod_desc = 3;
  optional TensorDesc selected_rows_desc = 4;
  optional bool persistable = 5 [ default = false ];
}
```

## InferShape for Selected Rows

Just like `LoD` information, the `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or a dense tensor.

For example, the gradient operator of `TableLookup` will always generate a `SelectedRows`. Its `InferShape` method should look like the following:

```cpp
void TableLookupGrad::InferShape(context) {
  ...
  context.SetDataType("Embedding.Grad", kSelectedRows);
}
```

## Sparse Operators

There are several operators that should be written to support `SelectedRows`. They are:

1. Operators which generate a `SelectedRows` gradient, e.g., the gradient of `TableLookupOp`.
2. Optimization operators which support a `SelectedRows` gradient, e.g., `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator; `OpWithKernel::Run` should select a suitable kernel for either a dense tensor or a `SelectedRows` input.