Merge branch 'develop' into crf

8 years ago · 515981d714
parent d92c671d5f b504a2346c
commit 515981d714
192 changed files with 7903 additions and 880 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -86,6 +86,14 @@ if(ANDROID OR IOS)
        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKLML OFF CACHE STRING
        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
    # Compile PaddlePaddle mobile inference library
    if (NOT WITH_C_API)
        set(WITH_C_API ON CACHE STRING
            "Always compile the C_API when cross-compiling for Android and iOS" FORCE)
    endif()
    set(MOBILE_INFERENCE ON)
    add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@ -160,9 +168,11 @@ endif(USE_NNPACK)
 add_subdirectory(proto)
-# "add_subdirectory(go)" should be placed after the following loine,
+if(NOT MOBILE_INFERENCE)
-# because it depends on paddle/optimizer.
+    # "add_subdirectory(go)" should be placed after the following line,
-add_subdirectory(paddle/optimizer)
+    # because it depends on paddle/optimizer.
    add_subdirectory(paddle/optimizer)
 endif()
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depend on it.
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -24,6 +24,10 @@ if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 if(WITH_TESTING)
    add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 if(NOT WITH_TIMER)
    add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -389,13 +389,60 @@ function(go_test TARGET_NAME)
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endfunction(go_test)
 # Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support
 # Usage:
 #   paddle_protobuf_generate_cpp(<proto_srcs> <proto_hdrs> <proto_files>)
 function(paddle_protobuf_generate_cpp SRCS HDRS)
  # Registers one protoc build rule per .proto file passed after HDRS and
  # hands the generated file lists back to the caller:
  #   SRCS - name of the caller variable that receives the *.pb.cc paths
  #   HDRS - name of the caller variable that receives the *.pb.h paths
  #   ARGN - the .proto files to compile
  # Generated files are placed in CMAKE_CURRENT_BINARY_DIR.
  if(NOT ARGN)
    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
    return()
  endif()

  # Begin with empty result lists.
  set(${SRCS})
  set(${HDRS})

  # Prefix for the --cpp_out value: "lite:" when MOBILE_INFERENCE is set,
  # otherwise nothing.
  set(_cpp_out_prefix "")
  if(MOBILE_INFERENCE)
    set(_cpp_out_prefix "lite:")
  endif()

  foreach(_proto ${ARGN})
    get_filename_component(_proto_abs ${_proto} ABSOLUTE)
    get_filename_component(_proto_stem ${_proto} NAME_WE)

    # Output paths are derived from the .proto file name without extension.
    set(_pb_src "${CMAKE_CURRENT_BINARY_DIR}/${_proto_stem}.pb.cc")
    set(_pb_hdr "${CMAKE_CURRENT_BINARY_DIR}/${_proto_stem}.pb.h")
    list(APPEND ${SRCS} "${_pb_src}")
    list(APPEND ${HDRS} "${_pb_hdr}")

    # The output directory is created first, then protoc is run with the
    # current source directory as its import path.
    add_custom_command(
      OUTPUT "${_pb_src}" "${_pb_hdr}"
      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
              -I${CMAKE_CURRENT_SOURCE_DIR}
              --cpp_out "${_cpp_out_prefix}${CMAKE_CURRENT_BINARY_DIR}"
              ${_proto_abs}
      DEPENDS ${_proto_abs} protoc
      COMMENT "Running C++ protocol buffer compiler on ${_proto}"
      VERBATIM)
  endforeach()

  # Mark the outputs as generated and export both lists to the caller.
  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
 endfunction()
 function(proto_library TARGET_NAME)
  # Builds a C++ library target named TARGET_NAME from .proto files.
  # Keyword arguments:
  #   SRCS - .proto files handed to the C++ protobuf generation function
  #   DEPS - additional dependencies passed on to cc_library()
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  # NOTE(review): `options` is never set inside this function, so
  # "${options}" expands to whatever the calling scope defines (empty when
  # undefined) -- confirm this is intended.
  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(proto_srcs)
  set(proto_hdrs)
-  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
+  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  # Only the generated sources are passed as SRCS; `protobuf` is always
  # appended to the caller-supplied DEPS.
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
 endfunction()
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -73,25 +73,43 @@ function(link_paddle_exe TARGET_NAME)
        generate_rdma_links()
    endif()
-    target_circle_link_libraries(${TARGET_NAME}
+    if(MOBILE_INFERENCE)
-        ARCHIVE_START
+        target_circle_link_libraries(${TARGET_NAME}
-        paddle_gserver
+            ARCHIVE_START
-        paddle_function
+            paddle_gserver
-        ARCHIVE_END
+            paddle_function
-        paddle_pserver
+            ARCHIVE_END
-        paddle_trainer_lib
+            paddle_math
-        paddle_network
+            paddle_utils
-        paddle_math
+            paddle_parameter
-        paddle_utils
+            paddle_proto
-        paddle_parameter
+            paddle_cuda
-        paddle_proto
+            ${EXTERNAL_LIBS}
-        paddle_cuda
+            ${CMAKE_THREAD_LIBS_INIT}
-        paddle_optimizer
+            ${CMAKE_DL_LIBS}
-        ${EXTERNAL_LIBS}
+            ${RDMA_LD_FLAGS}
-        ${CMAKE_THREAD_LIBS_INIT}
+            ${RDMA_LIBS})
-        ${CMAKE_DL_LIBS}
+    else()
-        ${RDMA_LD_FLAGS}
+        target_circle_link_libraries(${TARGET_NAME}
-        ${RDMA_LIBS})
+            ARCHIVE_START
            paddle_gserver
            paddle_function
            ARCHIVE_END
            paddle_pserver
            paddle_trainer_lib
            paddle_network
            paddle_math
            paddle_utils
            paddle_parameter
            paddle_proto
            paddle_cuda
            paddle_optimizer
            ${EXTERNAL_LIBS}
            ${CMAKE_THREAD_LIBS_INIT}
            ${CMAKE_DL_LIBS}
            ${RDMA_LD_FLAGS}
            ${RDMA_LIBS})
    endif()
    if(ANDROID)
        target_link_libraries(${TARGET_NAME} log)
--- a/doc/design/block.md
+++ b/doc/design/block.md
@ -5,12 +5,12 @@
 Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
 - Caffe, Torch, and Paddle: sequences of layers.
- TensorFlow, Caffe2, Mxnet: graphs of operators.
+- TensorFlow, Caffe2, Mxnet: graph of operators.
 - PaddlePaddle: nested blocks, like C++ and Java programs.
 ## Block in Programming Languages and Deep Learning
-In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions, or operators.
+In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
 Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
@ -24,14 +24,14 @@ A key difference is that a C++ program describes a one pass computation, whereas
 ## Stack Frames and the Scope Hierarchy
-The existence of the backward makes the execution of a block of traditional programs and PaddlePaddle different to each other:
+The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
-| programming languages | PaddlePaddle                  |
+| programming languages | PaddlePaddle                    |
-|-----------------------|-------------------------------|
+|-----------------------|---------------------------------|
-| stack                 | scope hierarchy               |
+| stack                 | scope hierarchy                 |
-| stack frame           | scope                         |
+| stack frame           | scope                           |
-| push at entering block| push at entering block        |
+| push at entering block| push at entering block          |
-| pop at leaving block  | destroy at minibatch completes|
+| pop at leaving block  | destroy when minibatch completes|
 1. In traditional programs:
@ -42,9 +42,9 @@ The existence of the backward makes the execution of a block of traditional prog
 1. In PaddlePaddle
   - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
-   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are to be used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
+   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
   - The height of the highest tree is the maximum depth of nested blocks.
-   - After the process of a minibatch, PaddlePaddle destroys the scope hierarchy.
+   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
 ## Use Blocks in C++ and PaddlePaddle Programs
@ -94,14 +94,14 @@ with ie.false_block():
 o1, o2 = ie(cond)
 ```
-In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `x+1` and `fc(x)`.
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` .
-A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.  The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
 ### Blocks with `for` and `RNNOp`
-The following RNN model from the [RNN design doc](./rnn.md)
+The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) :
 ```python
 x = sequence([10, 20, 30]) # shape=[None, 1]
@ -112,9 +112,9 @@ U = var(0.375, param=true) # shape=[1]
 rnn = pd.rnn()
 with rnn.step():
  h = rnn.memory(init = m)
-  hh = rnn.previous_memory(h)
+  h_prev = rnn.previous_memory(h)
  a = layer.fc(W, x)
-  b = layer.fc(U, hh)  
+  b = layer.fc(U, h_prev)  
  s = pd.add(a, b)
  act = pd.sigmoid(s)
  rnn.update_memory(h, act)
@ -147,9 +147,9 @@ for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
 ## Compilation and Execution
-Like TensorFlow programs, a PaddlePaddle program is written in Python.  The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference.
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
-The generation of this protobuf message is like what a compiler generates a binary executable file.  The execution of the message that the OS executes the binary file.
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
 ## The "Binary Executable File Format"
@ -186,8 +186,8 @@ Also, the RNN operator in above example is serialized into a protobuf message of
 ```
 OpDesc {
-  inputs = {0} // the index of x
+  inputs = {0} // the index of x in vars of BlockDesc above
-  outputs = {5, 3} // indices of act and hidden_out
+  outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
  attrs {
    "memories" : {1} // the index of h
    "step_net" : <above step net>
@ -203,14 +203,14 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing
 During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
 VarDesc in a block should have its own name scope so that local variables do not affect the parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example
+A child block's name scope should inherit the parent's so that an OpDesc in the child block can reference a VarDesc that is stored in the parent block. For example:
 ```python
-a = pd.Varaible(shape=[20, 20])
+a = pd.Variable(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])
 rnn = pd.create_rnn()
-with rnn.stepnet()
+with rnn.stepnet():
    x = a.as_step_input()
    # reuse fc's parameter
    fc_without_b = pd.get_variable("fc.w")
@ -218,17 +218,17 @@ with rnn.stepnet()
 out = rnn()
 ```
-the method `pd.get_variable` can help retrieve a Variable by a name, a Variable may store in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
+The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
 In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
 To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
-`SymbolTable` can do the following stuff:
+`SymbolTable` can do the following:
 - store the definitions (some names and attributes) of variables and operators,
- to verify if a variable was declared,
+- verify if a variable was declared,
- to make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
 ```c++
@ -240,19 +240,18 @@ class SymbolTable {
  OpDesc* NewOp(const string& name="");
-  // TODO determine whether name is generated by python or C++
+  // TODO determine whether name is generated by python or C++.
-  // currently assume that a unique name will be generated by C++ if the
+  // Currently assume that a unique name will be generated by C++ if the
-  // argument name left default.
+  // argument name is left default.
  VarDesc* NewVar(const string& name="");
-  // find a VarDesc by name, if recursive true, find parent's SymbolTable
+  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
  // recursively.
  // this interface is introduced to support InferShape, find protobuf messages
  // of variables and operators, pass pointers into InferShape.
  // operator
  //
  // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
-  // be proposed and embedded into pybind to enable python operate on C++ pointers.
+  // be proposed and embedded into pybind to enable python operation on C++ pointers.
  VarDesc* FindVar(const string& name, bool recursive=true);
  OpDesc* FindOp(const string& name);
@ -270,7 +269,7 @@ class SymbolTable {
 After all the description of variables and operators is added into SymbolTable,
 the block has enough information to run.
-The `Block` class takes a `BlockDesc` as input, and provide `Run` and `InferShape` functions.
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
 ```c++
@ -302,7 +301,7 @@ public:
  void CreateVariables(const framework::Scope& scope);
  void CreateOperators();
-  // some other necessary interfaces of NetOp are list below
+  // some other necessary interfaces of NetOp are listed below
  // ...
 private:
@ -316,15 +315,14 @@ private:
 Block inherits from OperatorBase, which has a Run method.
 Block's Run method will run its operators sequentially.
-There is another important interface called `Eval`, which take some arguments called targets, and generate a minimal graph which takes targets as the end points and creates a new Block,
+There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets.
 after `Run`, `Eval` will get the latest value and return the targets.
 The definition of Eval is as follows:
 ```c++
 // clean a block description by targets using the corresponding dependency graph.
 // return a new BlockDesc with minimal number of operators.
-// NOTE not return a Block but the block's description so that this can be distributed
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
 // to a cluster.
 BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
--- a/doc/design/dcgan.png
+++ b/doc/design/dcgan.png
--- a/doc/design/gan_api.md
+++ b/doc/design/gan_api.md
--- a/doc/design/optimizer.md
+++ b/doc/design/optimizer.md
@ -0,0 +1,105 @@
 ## Optimizer Design
 ### The Problem
 A PaddlePaddle program, or a block, is a sequence of operators operating on variables.  A training program needs to do three kinds of work:
 1. the forward pass, which computes intermediate results and the cost(s),
 1. the backward pass, which derives gradients from intermediate results and costs, and
 1. the optimization pass, which updates model parameters to optimize the cost(s).
 These kinds of work rely on three kinds of operators:
 1. forward operators,
 1. gradient operators, and
 1. optimization operators.
 It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
 In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
 ### High-level Python API to describe the training process
 1. Users write code to describe the network:
 	```python
 	images = layer.data("images")
 	labels = layer.data("labels")
 	w1 = pd.var("w1")
 	b1 = pd.var("b1")
 	hidden = layer.fc(images, w=w1, b=b1)
 	cost = layer.mse(hidden, labels)
 	```
 	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
 2. Users create a certain kind of Optimizer with some argument.
 	```python
 	optimizer = AdagradOptimizer(learning_rate=0.001)
 	```
 3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
 	```python
 	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
 	```
 	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
 4. Users use Session/Executor to run this opt_op_list as target to do training.
 	```python
 	sess.run(target= opt_op_list, ...)
 	```
 #### Optimizer Python interface:
 ```python
 class Optimizer(object):
    """Optimizer Base class.
    """
    def __init__(self):
        pass
    def create_backward_pass(self, loss, parameter_list=None):
        """
        create and add gradient Operators in BlockDesc to Compute gradients of `loss`
        for parameters in parameter_list
        Args:
          loss: a variable generated by the cost function.
          parameter_list: parameters whose gradients need to be computed and which are updated to optimize the loss.
        Returns:
          list of (parameters, gradients) pair.
        """
        return None
    def create_optimization_pass(self, parameters_and_grads):
        """Add optimization operators to update gradients to variables.
        Args:
          parameters_and_grads: a list of (variable, gradient) pair to update.
        Returns:
          optimization_op_list: a list of optimization operators that will update the parameters using their gradients.
        """
        return None
    def minimize(self, loss, parameter_list):
        """Add operations to minimize `loss` by updating `parameter_list`.
        This method combines interface `create_backward_pass()` and
        `create_optimization_pass()` into one.
        """
        params_grads = self.create_backward_pass(loss, parameter_list)
        update_ops = self.create_optimization_pass(params_grads)
        return update_ops
 ```
 Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
@ -22,7 +22,7 @@ Whenever we create a block, we need to set its parent block to the current block
 ```python
 class Program(objects):
    def __init__(self):
-        self.proto = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
        self.blocks = vector<Block>()
        self.blocks.append(Block(self, -1)) # the global block
        self.current_block = 0          # initialized to the global block
@ -57,7 +57,7 @@ A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.m
 ```python
 class Block(objects):
    def __init__(self, program, parent_idx):
-        self.proto = core.NewBlock(program.proto)
+        self.desc = core.NewBlock(program.desc)
        self.program = program
        self.vars = map<string, Variable>()
        self.ops = vector<Operator>()
@ -98,11 +98,11 @@ class Operator(object):
                 outputs,# dict<string, Variable>
                 attrs   # dict<string, Any>
                 ):
-        self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
-        core.infer_shape(self.proto, inputs, outputs)
+        core.infer_shape(self.desc, inputs, outputs)
    def type(self):
-        return self.proto.type()
+        return self.desc.type()
 ```
 `Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
@ -124,7 +124,7 @@ class Variable(object):
            name = unique_name_generator()
        self.name = name
        self.block = block
-        self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
        self.writer = None
 ```
@ -214,3 +214,7 @@ def fc_layer(input, size, ...):
    out.writer = op
    return out
 ```
 ## Optimizer
 [Optimizer Design Doc](./optimizer.md)
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
@ -17,22 +17,22 @@ The goals of refactoring include:
 1. A graph is composed of *variables* and *operators*.
-1. The description of graphs must be capable of being serialized/deserialized, so that:
+1. The description of graphs must be serializable/deserializable, so that:
-   1. It can to be sent to the cloud for distributed execution, and
+   1. It can be sent to the cloud for distributed execution, and
   1. It can be sent to clients for mobile or enterprise deployment.
-1. The Python program does the following steps
+1. The Python program does two things
-   1. *compilation*: run a Python program to generate a protobuf message representation of the graph and send it to
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
      1. the C++ library `libpaddle.so` for local execution,
      1. the master process of a distributed training job for training, or
      1. the server process of a Kubernetes serving job for distributed serving.
-   1. *execution*: execute the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
 ## Description and Realization of Computation Graph
-At compile time, the Python program generates a protobuf message representation of the graph, or the description of the graph.
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
 At runtime, the C++ program realizes the graph and runs it.
@ -42,11 +42,11 @@ At runtime, the C++ program realizes the graph and runs it.
 |Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
 |Block|BlockDesc|Block|
-The word *graph* is interchangeable with *block* in this document.  A graph represents computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
+The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of curly braces (`{` and `}`).
 ## Compilation and Execution
-1. Run an application Python program to describe the graph.  In particular, the Python application program does the following:
+1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
   1. Create `VarDesc` to represent local/intermediate variables,
   1. Create operators and set attributes,
@ -54,10 +54,10 @@ The word *graph* is interchangeable with *block* in this document.  A graph repr
   1. Infer the type and the shape of variables,
   1. Plan memory-reuse for variables,
   1. Generate the backward graph
-   1. Optimize the computation graph.
+   1. Add optimization operators to the computation graph.
-   1. Potentially, split the graph for distributed training.
+   1. Optionally, split the graph for distributed training.
-1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the application Python program does the following:
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
      1. realize local variables defined in the BlockDesc message in the new scope,
@ -107,8 +107,8 @@ Compile Time -> IR -> Runtime
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
 * `Operator` is the fundamental building block of the user interface.
-    * Operator stores input/output variable names, and attributes.
+    * Operator stores input/output variable names and attributes.
-    * The `InferShape` interface is used to infer the shape of the output variable shapes based on the shapes of the input variables.
+    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
    * Use `Run` to compute the `output` variables from the `input` variables.
 ---
@ -139,7 +139,7 @@ Compile Time -> IR -> Runtime
    * Limit the number of `tensor.device(dev) = ` in your code.
 * `thrust::transform` and `std::transform`.
    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
-    * `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
+    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
 * Hand-writing `GPUKernel` and `CPU` code
    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
@ -185,10 +185,10 @@ Make sure the registration process is executed and linked.
 1. Write an Op class and its gradient Op class, if required.
 2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
 3. Invoke the macro `REGISTER_OP`. This macro will
-	1. Call maker class to complete the `proto` and the `checker`
+	1. Call maker class to complete `proto` and `checker`
 	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
-4. Invoke the `USE` macro in which the Op is used, to make sure that it is linked.
+4. Invoke the `USE` macro in which the Op is used to make sure that it is linked.
 ---
 # Backward Module (1/2)
@ -199,13 +199,14 @@ Make sure the registration process is executed and linked.
 ---
 # Backward Module (2/2)
 ### Build Backward Network
- **Input**: graph of forward operators
+- **Input**: a graph of forward operators
- **Output**: graph of backward operators
+- **Output**: a graph of backward operators
 - **Corner cases in construction**
 	- Shared Variables => insert an `Add` operator to combine gradients
 	- No Gradient => insert a `fill_zero_grad` operator
 	- Recursive NetOp => call `Backward` recursively
 	- RNN Op => recursively call `Backward` on stepnet
 	- RNN Op => recursively call `Backward` on stepnet
 ---
@ -215,10 +216,10 @@ Make sure the registration process is executed and linked.
 	* Only dims and data pointers are stored in `Tensor`.
 	* All operations on `Tensor` are written in `Operator` or global functions.
 	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
-* `Variable` instances are the inputs and the outputs of an operator. Not just `Tensor`.
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
 	* `step_scopes` in RNN is a variable and not a tensor.
-* `Scope` is where variables are stores.
+* `Scope` is where variables are stored.
-	* map<string `variable_name`, Variable>
+	* map<string `var name`, Variable>
 	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
 ---
@ -246,7 +247,7 @@ Make sure the registration process is executed and linked.
 ---
 # Control the migration quality
 - Compare the performance of migrated models with old ones.
- Follow the google C++ style
+- Follow the Google C++ style guide.
 - Build the automatic workflow of generating Python/C++ documentation.
  - The documentation of layers and ops should be written inside the code.
  - Take the documentation quality into account when submitting pull requests.
--- a/doc/design/selected_rows.md
+++ b/doc/design/selected_rows.md
@ -0,0 +1,74 @@
 # Design Doc: Selected Rows
 `SelectedRows` is a sparse tensor data type designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor: only a few of its rows contain non-zero values. It is straightforward to represent such a tensor with the following data structure:
 ```cpp
 class SelectedRows {
 private:
  vector<int> rows_;
  Tensor value_;
  int height_;
 };
 ```
 The field `height_` is the first dimension of `SelectedRows`. The field `rows_` holds the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows_.size() /* NUM_ROWS */, ...]`, which supplies the values for each of those rows. The shape of a `SelectedRows` is therefore `[height_] + value_.shape[1:]`.
 Suppose that a `SelectedRows`-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`. The `SelectedRows` representation would then be:
 ```
 x = SelectedRows {
  rows = [73, 84],
  value = [[1, 2], [3,4]]
 }
 ```
 ## SelectedRows in Protobuf
 `SelectedRows` is a kind of `Variable`, so `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimensions of a `SelectedRows` are described at compile time, since `rows_` and `value_` depend on the training data.
 So we use `TensorDesc` to unify `data_type` and `dims`. A `LodTensorDesc` contains a `TensorDesc` and a `lod_level`. The description of a `SelectedRows` is simply a `TensorDesc`.
 ```proto
 message TensorDesc {
  required DataType data_type = 1;
  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
 }
 message LodTensorDesc {
  required TensorDesc tensor = 1;
  optional int32 lod_level = 2;
 }
 message VarDesc {
  required string name = 1;
  enum VarType { 
    LOD_TENSOR = 0;
    SELECTED_ROWS = 1;
  }
  required VarType type = 2;
  optional LodTensorDesc lod_desc = 3;
  optional TensorDesc selected_rows_desc = 4;
  optional bool persistable = 5 [ default = false ];
 }
 ```
 ## InferShape for Selected Rows
 Just like `LoD` information, the `InferShape` method infers the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or a `Dense` tensor.
 For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should look like the following:
 ```cpp
 void TableLookupGrad::InferShape(context) {
  ...
  context.SetDataType("Embedding.Grad", kSelectedRows);
 }
 ```
 ## Sparse Operators
 Several operators need to be written to support `SelectedRows`. They are:
 1. Operators that generate a `SelectedRows` gradient, e.g. the gradient of `TableLookupOp`.
 2. Optimizer operators that support a `SelectedRows` gradient, e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator; `OpWithKernel::Run` should select a suitable kernel for either a `dense` tensor or `SelectedRows`.
--- a/doc/design/test.dot
+++ b/doc/design/test.dot
@ -0,0 +1,35 @@
 // Graph of a generator/discriminator pair feeding two losses (d_loss, g_loss).
 // NOTE(review): the red and green dashed edges run opposite to the solid
 // edges and presumably depict the backward passes of d_loss and g_loss
 // respectively -- confirm against the accompanying design doc.
 digraph Test {
    // Solid edges leading into d_loss.
    z -> generator -> G_img;
    G_img -> discriminator -> D_f -> d_loss_f;
    label0 -> d_loss_f -> d_loss;
    img -> discriminator -> D_t -> d_loss_t;
    label1 -> d_loss_t -> d_loss;
    // Red dashed edges: from d_loss back to the discriminator.
    d_loss -> d_loss_t[color=red, style=dashed];
    d_loss -> d_loss_f[color=red, style=dashed];
    d_loss_t -> D_t[color=red, style=dashed];
    d_loss_f -> D_f[color=red, style=dashed];
    D_t -> discriminator[color=red, style=dashed];
    D_f -> discriminator[color=red, style=dashed];
    // Solid edges leading into g_loss.
    D_f -> g_loss;
    label2 -> g_loss;
    // Green dashed edges: from g_loss back through the discriminator to the generator.
    g_loss -> D_f[color=green, style=dashed];
    D_f -> discriminator[color=green, style=dashed];
    discriminator -> G_img[color=green, style=dashed];
    G_img -> generator[color=green, style=dashed];
    // Node styling: the two networks are boxes, the source nodes (z, img, labels) are diamonds.
    discriminator [color=red, shape=box];
    generator [color=green, shape=box];
    z [shape=diamond];
    img [shape=diamond];
    label0 [shape=diamond];
    label1 [shape=diamond];
    label2 [shape=diamond];
    // Each loss node takes the color of its dashed edges.
    d_loss [color=red];
    g_loss [color=green];
 }
--- a/doc/design/test.dot.png
+++ b/doc/design/test.dot.png
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -1,27 +1,32 @@
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
 add_subdirectory(testing)
 add_subdirectory(math)
 add_subdirectory(parameter)
 add_subdirectory(gserver)
-add_subdirectory(pserver)
+add_subdirectory(parameter)
-add_subdirectory(trainer)
+add_subdirectory(testing)
 add_subdirectory(scripts)
 add_subdirectory(string)
 if(Boost_FOUND)
  add_subdirectory(memory)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
  add_subdirectory(pybind)
 endif()
-if(WITH_C_API)
+if(MOBILE_INFERENCE)
  add_subdirectory(capi)
-endif()
+else()
  add_subdirectory(pserver)
  add_subdirectory(trainer)
  add_subdirectory(string)
  add_subdirectory(scripts)
  if(WITH_C_API)
    add_subdirectory(capi)
  endif()
  if(Boost_FOUND)
    add_subdirectory(memory)
    add_subdirectory(platform)
    add_subdirectory(framework)
    add_subdirectory(operators)
    add_subdirectory(pybind)
  endif()
-if(WITH_SWIG_PY)
+  if(WITH_SWIG_PY)
-  add_subdirectory(api)
+    add_subdirectory(api)
  endif()
 endif()
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@ -37,9 +37,7 @@ set(PADDLE_CAPI_INFER_LIBS
    paddle_cuda
    paddle_function
    paddle_gserver
-    paddle_proto
+    paddle_proto)
    paddle_pserver
    paddle_network)
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
--- a/paddle/capi/tests/CMakeLists.txt
+++ b/paddle/capi/tests/CMakeLists.txt
@ -4,11 +4,12 @@ add_unittest(capi_test_mats test_Vector.cpp
 target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
 target_link_libraries(capi_test_mats paddle_capi)
-
+if(NOT MOBILE_INFERENCE)
-add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
+    add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
-target_include_directories(capi_test_gradientMachine PUBLIC
+    target_include_directories(capi_test_gradientMachine PUBLIC
-  ${PADDLE_CAPI_INC_PATH})
+      ${PADDLE_CAPI_INC_PATH})
-target_link_libraries(capi_test_gradientMachine paddle_capi)
+    target_link_libraries(capi_test_gradientMachine paddle_capi)
-add_test(NAME capi_test_gradientMachine
+    add_test(NAME capi_test_gradientMachine
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
+      COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
 endif()
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -19,7 +19,7 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
@ -42,5 +42,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward ${GLOB_OP_LIB})
 #if(WITH_GPU)
 #    nv_test(executor_test SRCS executor_test.cc DEPS executor)
 #else()
 #    cc_test(executor_test SRCS executor_test.cc DEPS executor)
 #endif()
 cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
 cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -172,30 +172,14 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                              std::to_string(i));
        net->ops_[op_offset]->Rename(name, dup_outputs.back());
      }
-      // collect all the offset to append `add` op for each alias
+      // collect all the offset for each alias,
-      //
+      // insert a sum operator to add all aliases to output
-      // one variable is shared between multiple operators.
+      insert_position.push_back(
-      // insert add operator one by one, then add it to output
+          {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}},
-      for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+                                               {{"Out", {name}}}, {})});
           ++output_idx) {
        auto insert_add_x = dup_outputs[output_idx];
        auto insert_add_y = dup_outputs[output_idx + 1];
        auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
        // first add op inserted
        if (output_idx == dup_outputs.size() - 2) {
          insert_add_out = name;
        }
        if (output_idx != 0) {
          insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
        }
        insert_position.push_back(
            {dup_op.back(),
             OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}},
                                  {{"Out", {insert_add_out}}}, {})});
      }
    }
-    // make sure the inserted `add` ops follow the BFS order.
+    // make sure the inserted `sum` ops follow the BFS order.
    insert_position.sort(
        [](const Pos& l, const Pos& r) { return l.first > r.first; });
@ -302,7 +286,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
    return grad_op_descs;  // empty vector
  }
-  grad_op_descs = OpRegistry::CreateGradOpDescs(*op_desc);
+  grad_op_descs = OpRegistry::CreateGradOpDescs(op_desc.get());
  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
  for (auto& desc : grad_op_descs) {
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@ -27,6 +27,8 @@ extern std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);
 // TODO(jiayi): Add target as parameter and generate backward op
 // according to target.
 void AppendBackward(ProgramDescBind& program_desc,
                    const std::unordered_set<std::string>& no_grad_vars);
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@ -58,6 +58,8 @@ class MulOpMaker : public OpProtoAndCheckerMaker {
    AddInput("X", "A");
    AddInput("Y", "B");
    AddOutput("Out", "Out");
    AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
    AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
    AddComment("Mul");
  }
 };
@ -440,6 +442,28 @@ TEST(Backward, simple_single_op) {
            std::vector<std::string>({f::GradVarName("b")}));
 }
 // Checks that attributes left at their default value on a forward "mul" op
 // are visible on that op and on the "mul_grad" op appended by AppendBackward.
 TEST(Backward, default_attribute) {
  f::ProgramDesc *raw_program = GetNewProgramDesc();
  f::ProgramDescBind &bound_program = f::ProgramDescBind::Instance(raw_program);
  f::BlockDescBind *root_block = bound_program.Block(0);
  // Build a single "mul" op without setting any attribute explicitly.
  f::OpDescBind *mul_op = root_block->AppendOp();
  mul_op->SetType("mul");
  mul_op->SetInput("X", {"x"});
  mul_op->SetInput("Y", {"y"});
  mul_op->SetOutput("Out", {"out"});
  AppendBackward(bound_program, {});
  // Exactly one op must have been appended after the forward op.
  ASSERT_EQ(root_block->AllOps().size(), 2UL);
  EXPECT_EQ(boost::get<int>(mul_op->GetAttr("x_num_col_dims")), 1);
  EXPECT_EQ(boost::get<int>(mul_op->GetAttr("y_num_col_dims")), 1);
  // The appended op is the gradient op and must report the same defaults.
  f::OpDescBind *mul_grad_op = root_block->AllOps()[1];
  ASSERT_EQ(mul_grad_op->Type(), "mul_grad");
  EXPECT_EQ(boost::get<int>(mul_grad_op->GetAttr("x_num_col_dims")), 1);
  EXPECT_EQ(boost::get<int>(mul_grad_op->GetAttr("y_num_col_dims")), 1);
 }
 TEST(Backward, simple_mult_op) {
  f::ProgramDesc *program_desc = GetNewProgramDesc();
  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@ -74,6 +74,12 @@ void BlockDescBind::Sync() {
    for (auto &op_desc : ops_) {
      op_field.AddAllocated(op_desc->Proto());
    }
    auto &var_field = *this->desc_->mutable_vars();
    var_field.Clear();
    var_field.Reserve(static_cast<int>(vars_.size()));
    for (auto &var_desc : vars_) {
      var_field.AddAllocated(var_desc.second->Proto());
    }
    need_update_ = false;
  }
 }
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <deque>
 #include <memory>
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/op_desc.h"
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@ -28,7 +28,6 @@ inline DataType ToDataType(std::type_index type) {
    return DataType::INT32;
  } else {
    PADDLE_THROW("Not supported");
    return static_cast<DataType>(-1);
  }
 }
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@ -0,0 +1,163 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/framework/executor.h"
 #include <algorithm>
 #include <iostream>
 #include <memory>
 #include <set>
 #include <vector>
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 namespace paddle {
 namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
 // Creates one device context per entry of `places`.
 //
 // places: non-empty list of target places. CPU places get a CPUDeviceContext;
 //         GPU places get a CUDADeviceContext when built with PADDLE_WITH_CUDA
 //         and raise an error otherwise.
 //
 // The contexts are owned by this object and released in ~Executor().
 Executor::Executor(const std::vector<platform::Place>& places) {
  PADDLE_ENFORCE_GT(places.size(), 0);
  // resize() value-initializes every slot to nullptr, so the cleanup path
  // below can safely `delete` slots that were never filled.
  device_contexts_.resize(places.size());
  try {
    for (size_t i = 0; i < places.size(); i++) {
      if (platform::is_cpu_place(places[i])) {
        device_contexts_[i] = new platform::CPUDeviceContext(
            boost::get<platform::CPUPlace>(places[i]));
      } else if (platform::is_gpu_place(places[i])) {
 #ifdef PADDLE_WITH_CUDA
        device_contexts_[i] = new platform::CUDADeviceContext(
            boost::get<platform::GPUPlace>(places[i]));
 #else
        PADDLE_THROW(
            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
            "option");
 #endif
      }
    }
  } catch (...) {
    // ~Executor() does not run when the constructor exits with an exception,
    // so contexts allocated before the failure must be released here.
    for (auto& device_context : device_contexts_) {
      delete device_context;
    }
    throw;
  }
 }
 // Releases every device context held in device_contexts_.
 Executor::~Executor() {
  for (size_t idx = 0; idx < device_contexts_.size(); ++idx) {
    delete device_contexts_[idx];
  }
 }
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
  // TODO(tonyyang-svail):
  //    - only runs on the first device (i.e. no interdevice communication)
  //    - will change to use multiple blocks for RNN op and Cond Op
  PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id);
  auto& block = pdesc.blocks(block_id);
  auto& device = device_contexts_[0];
  // Instantiate all the vars in the global scope
  for (auto& var : block.vars()) {
    scope->NewVar(var.name());
  }
  Scope& local_scope = scope->NewScope();
  std::vector<bool> should_run = Prune(pdesc, block_id);
  PADDLE_ENFORCE_EQ(should_run.size(), static_cast<size_t>(block.ops_size()));
  for (size_t i = 0; i < should_run.size(); ++i) {
    if (should_run[i]) {
      for (auto& var : block.ops(i).outputs()) {
        for (auto& argu : var.arguments()) {
          if (local_scope.FindVar(argu) == nullptr) {
            local_scope.NewVar(argu);
          }
        }
      }
      auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i));
      op->Run(local_scope, *device);
    }
  }
  // TODO(tonyyang-svail):
  //  - Destroy local_scope
 }
 // Decides which ops of block `block_id` have to be executed.
 //
 // Walking the ops from last to first, an op is kept when it is a fetch op or
 // when one of its outputs is currently required by an already kept op. The
 // result has one flag per op, in program order.
 std::vector<bool> Prune(const ProgramDesc& pdesc, int block_id) {
  // TODO(tonyyang-svail):
  //    - will change to use multiple blocks for RNN op and Cond Op
  const auto& block_desc = pdesc.blocks(block_id);
  const auto& op_list = block_desc.ops();
  // A feed op is only legal while every op before it was a feed op too.
  bool feed_allowed = true;
  for (const auto& op : op_list) {
    const bool is_feed = (op.type() == kFeedOpType);
    PADDLE_ENFORCE(!is_feed || feed_allowed,
                   "All FeedOps are at the beginning of the ProgramDesc");
    feed_allowed = is_feed;
  }
  // Mirror check from the back: fetch ops must form the tail of the block.
  bool fetch_allowed = true;
  for (auto it = op_list.rbegin(); it != op_list.rend(); ++it) {
    const bool is_fetch = (it->type() == kFetchOpType);
    PADDLE_ENFORCE(!is_fetch || fetch_allowed,
                   "All FetchOps must at the end of the ProgramDesc");
    fetch_allowed = is_fetch;
  }
  // Names of variables that the ops kept so far still need as inputs.
  std::set<std::string> needed_vars;
  std::vector<bool> keep_flags;
  for (auto it = op_list.rbegin(); it != op_list.rend(); ++it) {
    const auto& op = *it;
    bool produces_needed_var = false;
    for (const auto& out_slot : op.outputs()) {
      for (const auto& out_name : out_slot.arguments()) {
        produces_needed_var =
            produces_needed_var || needed_vars.count(out_name) != 0;
      }
    }
    const bool keep = (op.type() == kFetchOpType) || produces_needed_var;
    if (keep) {
      // The outputs are now satisfied by this op ...
      for (const auto& out_slot : op.outputs()) {
        for (const auto& out_name : out_slot.arguments()) {
          needed_vars.erase(out_name);
        }
      }
      // ... and its inputs become requirements for the ops before it.
      for (const auto& in_slot : op.inputs()) {
        for (const auto& in_name : in_slot.arguments()) {
          needed_vars.insert(in_name);
        }
      }
    }
    keep_flags.push_back(keep);
  }
  // TODO(tonyyang-svail):
  //    - check this after integration of Init
  // PADDLE_ENFORCE(dependent_vars.empty());
  // The flags were collected back to front; flip them into program order.
  std::reverse(keep_flags.begin(), keep_flags.end());
  return keep_flags;
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@ -0,0 +1,55 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
 namespace paddle {
 namespace framework {
 /* @Brief
  * Executes the ops of a ProgramDesc block on a set of device contexts.
  * One device context is created per place passed to the constructor; the
  * contexts are owned by the Executor and freed in its destructor.
  */
 class Executor {
 public:
  explicit Executor(const std::vector<platform::Place>& places);
  ~Executor();
  // The device contexts are held as raw owning pointers, so a copy would
  // delete them twice; copying is therefore disabled.
  Executor(const Executor&) = delete;
  Executor& operator=(const Executor&) = delete;
  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
   *
   * @param
   *  pdesc     ProgramDesc to evaluate
   *  scope     Scope in which the block's variables are created
   *  block_id  index of the block of pdesc to run
   */
  void Run(const ProgramDesc& pdesc, Scope* scope, int block_id);
 private:
  // One entry per place given to the constructor.
  std::vector<platform::DeviceContext*> device_contexts_;
 };
 /* @Brief
 * Pruning the graph
 *
 * @param
 *  ProgramDesc
 *
 * @return
 *  vector<bool> Same size as ops. Indicates whether an op should be run.
 */
 std::vector<bool> Prune(const ProgramDesc& pdesc, int block_id);
 }  // namespace framework
 }  // namespace paddle
--- a/Show More
+++ b/Show More