remove conflict

8 years ago · 3db3a1066b
parent ba791f7b3f c3b46d1683
commit 3db3a1066b
327 changed files with 10057 additions and 3102 deletions
--- a/README.md
+++ b/README.md
@ -51,19 +51,19 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 - **Connected to Products**
    In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
-    PaddlePaddle has been deployed into products or service with a vast number
+    PaddlePaddle has been deployed into products and services with a vast number
    of users, including ad click-through rate (CTR) prediction, large-scale image
    classification, optical character recognition (OCR), search ranking, computer
    virus detection, recommendation, etc. It is widely utilized in products at
-    Baidu and it has achieved a significant impact. We hope you can also exploit
+    Baidu and it has achieved a significant impact. We hope you can also explore
-    the capability of PaddlePaddle to make a huge impact for your product.
+    the capability of PaddlePaddle to make an impact on your product.
 ## Installation
 It is recommended to check out the
 [Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
 ## Documentation
@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
-  You might want to start from this online interactive book that can run in Jupyter Notebook.
+  You might want to start from this online interactive book that can run in a Jupyter Notebook.
 - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@ -1,10 +1,9 @@
 set -e
 unset OMP_NUM_THREADS MKL_NUM_THREADS
 export OMP_DYNAMIC="FALSE"
 export KMP_AFFINITY="granularity=fine,compact,0,0"
 function train() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS
  export OMP_DYNAMIC="FALSE"
  export KMP_AFFINITY="granularity=fine,compact,0,0"
  topology=$1
  bs=$2
  use_mkldnn=$3
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -49,11 +49,12 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 if(NOT WITH_GPU)
    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)
    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
    add_definitions(-DPADDLE_WITH_CUDA)
    FIND_PACKAGE(CUDA REQUIRED)
    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
--- a/doc/api/v1/index_cn.rst
+++ b/doc/api/v1/index_cn.rst
@ -21,7 +21,7 @@ Model Config API
    trainer_config_helpers/optimizers.rst
    trainer_config_helpers/data_sources.rst
    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/activations.rst
    trainer_config_helpers/poolings.rst
    trainer_config_helpers/networks.rst
    trainer_config_helpers/evaluators.rst
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -345,6 +345,11 @@ clip
 ..  autoclass:: paddle.v2.layer.clip
    :noindex:
 resize
 ------
 ..  autoclass:: paddle.v2.layer.resize
    :noindex:
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
--- a/doc/design/block.md
+++ b/doc/design/block.md
@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples.
 The following C++ program shows how blocks are used with the `if-else` structure:
 ```c++
 namespace pd = paddle;
 int x = 10;
-int y = 20;
+int y = 1;
-int out;
+int z = 10;
 bool cond = false;
 int o1, o2;
 if (cond) {
  int z = x + y;
-  out = softmax(z);
+  o1 = z;
  o2 = pd::layer::softmax(z);
 } else {
-  int z = fc(x);
+  int d = pd::layer::fc(z);
-  out = z;
+  o1 = d;
  o2 = d+1;
 }
 ```
 An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator
 ```python
 import paddle as pd
-x = var(10)
+x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(20)
+y = var(1) # shape=[1], value=1
-cond = var(false)
+z = minibatch([10, 20, 30]) # shape=[None, 1]
-ie = pd.create_ifelseop(inputs=[x], output_num=1)
+cond = larger_than(x, 15) # [false, true, true]
 ie = pd.ifelse()
 with ie.true_block():
-    x = ie.inputs(true, 0)
+    d = pd.layer.add_scalar(x, y)
-    z = operator.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
    ie.set_output(true, 0, operator.softmax(z))
 with ie.false_block():
-    x = ie.inputs(false, 0)
+    d = pd.layer.fc(z)
-    z = layer.fc(x)
+    ie.output(d, d+1)
-    ie.set_output(true, 0, operator.softmax(z))
+o1, o2 = ie(cond)
 out = b(cond)
 ```
-In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `x+1` and `fc(x)`.
 A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.  The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.
 ### Blocks with `for` and `RNNOp`
 The following RNN model from the [RNN design doc](./rnn.md)
 ```python
-x = sequence([10, 20, 30])
+x = sequence([10, 20, 30]) # shape=[None, 1]
-m = var(0)
+m = var(0) # shape=[1]
-W = tensor()
+W = var(0.314, param=true) # shape=[1]
-U = tensor()
+U = var(0.375, param=true) # shape=[1]
-
+
-rnn = create_rnn(inputs=[input])
+rnn = pd.rnn()
-with rnn.stepnet() as net:
+with rnn.step():
-  x = net.set_inputs(0)
+  h = rnn.memory(init = m)
-  h = net.add_memory(init=m)
+  hh = rnn.previous_memory(h)
-  fc_out = pd.matmul(W, x)
+  a = layer.fc(W, x)
-  hidden_out = pd.matmul(U, h.pre(n=1))
+  b = layer.fc(U, hh)  
-  sum = pd.add_two(fc_out, hidden_out)
+  s = pd.add(a, b)
-  act = pd.sigmoid(sum)
+  act = pd.sigmoid(s)
-  h.update(act)                       # update memory with act
+  rnn.update_memory(h, act)
-  net.set_outputs(0, act, hidden_out) # two outputs
+  rnn.output(a, b)
 o1, o2 = rnn()
 print o1, o2
 ```
 has its equivalent C++ program as follows
 ```c++
 int* x = {10, 20, 30};
-int m = 0;
+int* m = {0};
-int W = some_value();
+int* W = {0.314};
-int U = some_other_value();
+int* U = {0.375};
 int mem[sizeof(x) / sizeof(x[0]) + 1];
 int o1[sizeof(x) / sizeof(x[0]) + 1];
@ -131,20 +135,16 @@ int o2[sizeof(x) / sizeof(x[0]) + 1];
 for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
  int x = x[i-1];
  if (i == 1) mem[0] = m;
-  int fc_out = W * x;
+  int a = W * x;
-  int hidden_out = Y * mem[i-1];
+  int b = Y * mem[i-1];
-  int sum = fc_out + hidden_out;
+  int s = fc_out + hidden_out;
  int act = sigmoid(sum);
  mem[i] = act;
  o1[i] = act;
  o2[i] = hidden_out;
 }
 print_array(o1);
 print_array(o2);
 ```
 ## Compilation and Execution
 Like TensorFlow programs, a PaddlePaddle program is written in Python.  The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
@ -210,11 +210,11 @@ a = pd.Varaible(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])
 rnn = pd.create_rnn()
-with rnn.stepnet() as net:
+with rnn.stepnet()
-    x = net.set_inputs(a)
+    x = a.as_step_input()
    # reuse fc's parameter
    fc_without_b = pd.get_variable("fc.w")
-    net.set_outputs(fc_without_b)
+    rnn.output(fc_without_b)
 out = rnn()
 ```
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
@ -1,41 +1,51 @@
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack().
+# The `IfElse` Operator
-```python
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
 import paddle as pd
-x = var()
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
-y = var()
+- the PaddlePaddle version takes a vector of boolean values as the condition, and instances corresponding to true values go to the true branch, while those corresponding to false values go to the false branch.
-cond = var()
+
-default_value = var()
+## Example
-b = pd.create_ifelseop(inputs=[x], output_num=1)
+
-with b.true_block():
+The following PaddlePaddle program shows the usage of the IfElse operator:
    x = b.inputs(0)
    z = operator.add(x, y)
    b.set_output(0, operator.softmax(z))
 with b.false_block():
    x = b.inputs(0)
    z = layer.fc(x)
    b.set_output(0, operator.softmax(z))
 out = b(cond)
 ```
 If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd
-x = var()
+x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var()
+y = var(1) # shape=[1], value=1
-cond = var()
+z = minibatch([10, 20, 30]) # shape=[None, 1]
-default_value = var()
+cond = larger_than(x, 15) # [false, true, true]
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
+
-
+ie = pd.ifelse()
-with b.true_block():
+with ie.true_block():
-    x = b.inputs(0)
+    d = pd.layer.add(x, y)
-    z = operator.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
-    b.set_output(0, operator.softmax(z))
+with ie.false_block():
    d = pd.layer.fc(z)
    ie.output(d, d+1)
 o1, o2 = ie(cond)
 ```
-out = b(cond)
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
 An equivalent C++ program is as follows:
 ```c++
 namespace pd = paddle;
 int x = 10;
 int y = 1;
 int z = 10;
 bool cond = false;
 int o1, o2;
 if (cond) {
  int d = x + y;
  o1 = z;
  o2 = pd::layer::softmax(z);
 } else {
  int d = pd::layer::fc(z);
  o1 = d;
  o2 = d+1;
 }
 ```
 where default_value is a list of vars for `cond` == False.
--- a/doc/design/program.md
+++ b/doc/design/program.md
@ -1,8 +1,10 @@
-# Design Doc: ProgramDesc
+# Design Doc: PaddlePaddle Programs
-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+## Compile and Execution
 A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
-As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
 ```python
 x = layer.data("images")
@ -13,36 +15,112 @@ optimize(cost)
 train(cost, reader=mnist.train())
 ```
-generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message:
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message.  The last line runs it.
-```protobuf
+## Programs and Blocks
-message ProgramDesc {
+
-  repeated BlockDesc blocks = 1;
+The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
 - program: some nested blocks
 - [block](./block.md):
  - some local variable definitions, and
  - a sequence of operators
 The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
 ```c++
 int main() { // block 0
  int i = 0;
  if (i < 10) { // block 1
    for (int j = 0; j < 10; j++) { // block 2
    }
  }
  return 0;
 }
 ```
 The following PaddlePaddle program has three blocks:
 ```python
 import paddle as pd  // block 0
 x = minibatch([10, 20, 30]) # shape=[None, 1]
 y = var(1) # shape=[1], value=1
 z = minibatch([10, 20, 30]) # shape=[None, 1]
 cond = larger_than(x, 15) # [false, true, true]
 ie = pd.ifelse()
 with ie.true_block():  // block 1
    d = pd.layer.add_scalar(x, y)
    ie.output(d, pd.layer.softmax(d))
 with ie.false_block():  // block 2
    d = pd.layer.fc(z)
    ie.output(d, d+1)
 o1, o2 = ie(cond)
 ```
 ## `BlockDesc` and `ProgramDesc`
 All protobuf messages are defined in `framework.proto`.
 `BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
 ```protobuf
 message BlockDesc {
  required int32 parent = 1;
  repeated VarDesc vars = 2;
  repeated OpDesc ops = 3;
 }
 ```
 The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
 All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
 ```protobuf
 message ProgramDesc {
  repeated BlockDesc blocks = 1;
 }
 ```
 ### Global Block
 The global block is the first one in the above array.
 ## Operators that Use Blocks
 In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
 The definition of `OpDesc` shows that an operator could have some attributes:
 ```protobuf
 message OpDesc {
  AttrDesc attrs = 1;
  ...
 }
 ```
 and an attribute could be of type block, which is, in fact, a block ID as described above:
 ```
 message AttrDesc {
-  required AttrType type = 1;
+  required string name = 1;
-  // index into ProgramDesc::blocks when type==BLOCK
+  enum AttrType {
-  optional int32 block = 2;
+    INT = 1,
    STRING = 2,
    ...
    BLOCK = ...
  }
  required AttrType type = 2;
  optional int32 block = 10; // when type == BLOCK
  ...
 }
 ```
-When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions.  This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks.  This requires that we can trace the parent of a block.
+## InferShape
 A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp.  In the above solution, all blocks are in `ProgramDesc::blocks`; this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`.  So that `AttrDesc::block` could be an integer block ID.
 With this design, the InferShape function should take the following parameters:
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
@ -0,0 +1,216 @@
 # Design Doc: Python API
 Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
 | Python classes | Protobuf messages |
 | --- | --- |
 | Program | ProgramDesc |
 | Block | BlockDesc |
 | Operator | OpDesc |
 | Variable | VarDesc |
 Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
 ## Core Concepts
 ### Program
 A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
 Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
 ```python
 class Program(objects):
    def __init__(self):
        self.proto = core.NewProgram() # a C++ ProgramDesc pointer.
        self.blocks = vector<Block>()
        self.blocks.append(Block(self, -1)) # the global block
        self.current_block = 0          # initialized to the global block
    def global_block():
        return self.blocks[0]
    def current_block():
        return self.get_block(self.current_block)
    def rollback():
        self.current_block = self.current_block().parent_idx
    def create_block():
        new_block_idx = len(self.block)
        self.blocks.append(Block(self, self.current_block))
        self.current_block = new_block_idx
        return current_block()
 ```
 `Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
 `Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
 ### Block
 A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
 1. a map from variable names to an instance of the Python `Variable` class, and
 1. a list of `Operator` instances.
 ```python
 class Block(objects):
    def __init__(self, program, parent_idx):
        self.proto = core.NewBlock(program.proto)
        self.program = program
        self.vars = map<string, Variable>()
        self.ops = vector<Operator>()
        self.parent_idx = parent_idx
    def create_var(self, ...):
        return Variable(self, ...)
    def _create_global_var(self, ...):
        program.global_block().create_var(...)
    def create_parameter(self, name, ...):
        # Parameter is a subclass of variable. See Parameter section for details.
        self.vars[name] = Parameter(self._create_global_var(...), ...)
        return self.vars[name]
    def append_operator(self, ...):
        self.ops.append(Operator(self, ...))
    def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
       self.ops.prepend(Operator(self, ...))
 ```
 `create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
 `prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
 ### Operator
 The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
 ```python
 class Operator(object):
    def __init__(self,
                 block,  # Block
                 type,   # string
                 inputs, # dict<string, Variable>
                 outputs,# dict<stirng, Variable>
                 attrs   # dict<string, Any>
                 ):
        self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
        core.infer_shape(self.proto, inputs, outputs)
    def type(self):
        return self.proto.type()
 ```
 `Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
 ### Variable
 Operators take Variables as their inputs and outputs.
 ```python
 class Variable(object):
    def __init__(self,
                 block=None,      # Block
                 name=None,       # string
                 shape,           # tuple
                 dtype="float32", # string
                 lod_level=None   # int
                 ):
        if name is None:
            name = unique_name_generator()
        self.name = name
        self.block = block
        self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
        self.writer = None
 ```
 Please be aware of `self.writer`, which tracks the operator that creates the variable.  It is possible that more than one operator writes a variable, but in Python space, each write to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
 ### Parameter
 A parameter is a global variable with an initializer (or load) operator.
 ```python
 class Parameter(Variable):
    def __init__(self,
                 block=None,      # Block
                 name=None,       # string
                 shape,           # tuple
                 dtype="float32", # string
                 lod_level=None   # int
                 trainable,       # bool
                 initialize_op_attrs,
                 optimize_op_attrs):
        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
        self.trainable = trainable
        self.optimize_op_attrs = optimize_op_attrs
        block.prepend(Operator(block,  # Block
                               initialize_op_attrs['type'],   # string
                               None,   # no inputs
                               self,   # output is the parameter
                               initialize_op_attrs)
 ```
 When users create a parameter, they can call
 ```python
 program.create_parameter(
  ...,
  init_attr={
    type: "uniform_random",
    min: -1.0,
    max: 1.0,
  })
 )
 ```
 In the above example, `init_attr.type` names an initialize operator.  It can also name the load operator
 ```python
 init_attr={
 type: "load",
 filename: "something.numpy",
 }
 ```
 `optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
 ## Layer Functions
 A layer is a Python function that creates some operators and variables.  Layers simplify the work of application programmers.
 ### Data Layer
 ```python
 def data_layer(name, type, column_name):
    block = the_current_program.glolal_block()
    var = block.create_global_var(
            name=name,
            shape=[None] + type.dims(),
            dtype=type.dtype)
    block.prepend_operator(block,
                           type="Feed",
                           inputs = None,
                           outputs = [var],
                           {column_name: column_name})
    return var
 ```
 The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md).
 ### FC Layer
 ```python
 def fc_layer(input, size, ...):
    block = program.current_block()
    w = block.create_parameter(...)
    b = block.create_parameter(...)
    out = block.create_var()
    op = block.append_operator("FC", X=input, W=w, b=b, out=out)
    out.writer = op
    return out
 ```
--- a/doc/design/refactor/session.md
+++ b/doc/design/refactor/session.md
@ -0,0 +1,180 @@
 # Design Doc: Session
 ## Abstract
 The *session* object encapsulates the environment in which the
 computation graph is executed.
 We will have the *local* session and *remote* session, they offer the
 same [interface](#interface). The local session encapsulates the local
 runtime environment and the remote session encapsulates the cluster
 runtime environment.
 The local runtime environment contains:
 1. computation devices (i.e., CPU, GPU) handles, and
 1. the [scope](../scope.md) which holds all variables.
 The remote runtime environment contains:
 1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
   and
 1. the distributed [scope](../scope.md) in a cluster which holds all
   variables.
 The user can create a remote session on Paddle Cloud and evaluate the
 computation graph with it. In this way, the user can control the
 remote computation resource in a cluster from his local computer.
 ## Background
 The current design has an implicit global session in which
 `paddle.eval()` is executed. The pain point is:
 Since the user is not able to explicitly switch between runtime
 environments, the user cannot run a topology in two independent
 environments.
 For example, in reinforcement learning, the user may want to have a
 stale model for inference and a fresh model for training, and only
 replace the stale model with the fresh model periodically.
 Furthermore, we have no concept that encapsulates a remote environment
 that executes a computation graph.
 We need the session object to address the above issues.
 ## Session
 A session is an object that owns the runtime environment. All
 computations are executed through `session.eval()`.
 ### Interface
 ```python
 eval(
    targets,
    feed_dict=None,
 )
 ```
 Evaluates the target Operations or Variables in `targets`.
 - *targets*: the evaluation targets. Can be a single Operation or
  Variable, or a list with the Operations or Variables as
  elements. The value returned by `eval()` has the same shape as the
  `target` argument.
  The PaddlePaddle program is represented by
  the [ProgramDesc](../program.md), `eval()` will infer the
  ProgramDesc from the given targets and run the PaddlePaddle
  program. Please
  see
  [this graph](./distributed_architecture.md#local-training-architecture) for
  the detailed illustration for the local session
  and
  [this graph](./distributed_architecture.md#distributed-training-architecture) for
  the detailed illustration for the remote session.
 - *feed_dict*: a dictionary that contains the tensors which override
  the edges of the computation graph.
  feed_dict not only can provide the input data, it can override any
  OP's input as well:
  ```python
  a = pd.constant(2.0, name="a")
  b = pd.variable(name="b")
  c = pd.mul(a,b)
  sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
  ```
 ```python
 close()
 ```
 Closes the session and releases the scope that the session owns.
 ### Create a Local Session
 ```python
 session(
    devices=None
 )
 ```
 Creates a new session. One session owns one global scope, so creating
 multiple sessions will create different scopes.
 - *devices*: a single `string` or a list of `string` of device names,
  the corresponding devices will be the computation devices for
  `eval()`. If not specified, all available devices (e.g., all GPUs)
  will be used. The user doesn't need to specify the CPU device since
  it will be always used. Multiple sessions can use the same device.
 #### Example
 ```Python
 a = paddle.constant(1.0)
 b = paddle.constant(2.0)
 c = a + b
 sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
 sess.eval(c)
 sess.close()
 ```
 ### Create a Remote Session
 ```python
 create_cloud_job(
    name,
    num_trainer,
    mem_per_trainer,
    gpu_per_trainer,
    cpu_per_trainer,
    num_ps,
    mem_per_ps,
    cpu_per_ps,
 )
 ```
 Creates a Paddle Cloud job. Fails if the job name exists.
 ```python
 get_cloud_job(
    name
 )
 ```
 Gets a Paddle Cloud job.
 ```python
 remote_session(
    job
 )
 ```
 - *job*: the Paddle Cloud job.
 #### Example
 ```Python
 reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
 image = reader.column(0)
 label = reader.column(1)
 fc1 = paddle.op.fc(image, size=256, act="sigmoid")
 fc2 = paddle.op.fc(fc1, size=10, act="softmax")
 cost = paddle.op.cross_entropy(fc2, label)
 opt = paddle.optimizer.sgd(cost)
 job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
 sess = paddle.remote_session(job)
 for i in range(1000):
    sess.eval(opt)
 sess.close()
 ```
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
--- a/doc/design/register_grad_op.md
+++ b/doc/design/register_grad_op.md
@ -0,0 +1,90 @@
 # Design Doc: Gradient Operators Registration
 ## The Problem Posed
 In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance.
 However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message and return a corresponding message.
 More than that, the new registration mechanism needs to support the fact that an operator's gradient computation might be a composition of operators.
 ## Current Implementation
 `OpInfo`s are stored in an associative map whose key is the operator type. The `grad_op_type_` field indicates the associated gradient operator type. An operator can create its gradient operator via the `OpInfo::creator_` of the gradient operator. The pseudo code is
 ```cpp
 struct OpInfo {
  std::function<OperatorBase*(...)> creator_;
  std::string grad_op_type_;
  ...
 };
 map<string, OpInfo> OpInfoMap;
 OperatorBase* CreateGradientOperator(const OperatorBase& op) {
  return OpInfoMap.at(op.Type()).creator_(...);
 }
 ```
 ## Proposed Solution
 The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
 ```cpp
 // (OpDesc) --> vector<OpDesc>
 std::function<std::vector<OpDescBind>(const OpDescBind&)>;
 ```
 The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for protobuf message `OpDesc` to manipulate `OpDesc` fast.
 The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be
 ```cpp
 struct OpInfo {
  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
  ...
 };
 ```
 The `grad_op_maker_` is `nullptr` if the operator does not have associated gradient operators.
 We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
 ```cpp
 class GradOpDescMakerBase {
 public:
  GradOpDescMakerBase(const OpDescBind& );
  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
 };
 ```
 We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
 ```cpp
 using GradOpMaker = ...;
 std::function<std::vector<OpDescBind>(const OpDescBind&)> func;
 func = [] (const OpDescBind& fwd_op) {
  GradOpMaker maker(fwd_op);
  return maker();
 };
 ```
 We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
 We should change the register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just registers one operator. If the `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro that contains `__VA_ARGS__`.
 The user interface should be
 ```cpp
 vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
 REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
 // Developers can still manually implement gradient operator.
 REGISTER_OPERATOR(minus_grad, MinusGradOp);
 ```
 The interface of the current `REGISTER_OP` macro will not be changed. Internally, `REGISTER_OP` will invoke `REGISTER_OPERATOR` twice and generate the GradOpDescMaker.
 ```cpp
 REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
 ```
--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
-    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 - 在 `.cu`文件中注册GPU Kernel。
@ -285,41 +285,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
        self.compare_grad(self.op, self.inputs)
    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
            self.inputs, ["Y"],
            "Out",
            max_relative_error=0.5,
            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
            self.inputs, ["X"],
            "Out",
            max_relative_error=0.5,
            no_grad_set={"Y"})
 ```
 下面解释代码中一些关键的地方:
 - 调用`create_op("mul")`创建反向Op对应的前向Op。
- 调用`compare_grad`函数对比CPU、GPU计算结果。
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
-  - 第一个参数`self.op` : 前向Op。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
-  - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
-  - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
  - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
 - `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
 ### 编译和执行单元测试
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@ -182,7 +182,7 @@ Note that **different devices (CPU, GPU)share an Op definition; whether or not t
 `MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
-To ease the writing of `OpKernel` compute, and for reusing code cross-device, `Eigen unsupported Tensor` module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
 This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
@ -205,7 +205,7 @@ The definition of its corresponding backward operator, if applicable, is similar
    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 - Registering GPU Kernel in `.cu` files
@ -293,41 +293,27 @@ class TestMulGradOp(GradientChecker):
            'Y': np.random.random((84, 100)).astype("float32")
        }
-    def test_cpu_gpu_compare(self):
+    def test_check_grad_normal(self):
        self.compare_grad(self.op, self.inputs)
    def test_normal(self):
        # mul op will enlarge the relative error
-        self.check_grad(
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
        self.check_grad(
-            self.op,
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
            self.inputs, ["Y"],
            "Out",
            max_relative_error=0.5,
            no_grad_set={"X"})
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
        self.check_grad(
-            self.op,
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
            self.inputs, ["X"],
            "Out",
            max_relative_error=0.5,
            no_grad_set={"Y"})
 ```
 Some key points in the code above include:
 - `create_op("mul")` creates the backward operator's corresponding forward operator.
 - `compare_grad` compares results between utilizing the CPU and the GPU.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
-  - The first variable `self.op` denotes the forward operator.
+  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
-  - The second variable `self.inputs` denotes the input dictionary, which has its key value identical to its `ProtoMaker` definitions.
+  - The second variable `"Out"` points to the network's final output target `Out`.
-  - The third variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
+  - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests.
-  - The fourth variable `"Out"` points to the network's final output target `Out`.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input.
 - `test_ignore_x` and `test_ignore_y`branches test the cases where there is only one scaling input.
 ### Compiling and Running
--- a/doc/howto/dev/use_eigen_en.md
+++ b/doc/howto/dev/use_eigen_en.md
@ -0,0 +1,146 @@
 ## How to use Eigen in Paddle
 Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
 ### Eigen Tensor Module
 The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
 Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse.
 For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
 ### paddle::framework::Tensor
 Paddle's Tensor is defined in the framework directory with the following interface:
 ```cpp
 class Tensor {
 public:
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T* data();
  /**
   * @brief   Return a pointer to mutable memory block.
   * @note    If not exist, then allocation.
   */
  template <typename T>
  inline T* mutable_data(platform::Place place);
  /**
   * @brief     Return a pointer to mutable memory block.
   *
   * @param[in] dims    The dimensions of the memory block.
   * @param[in] place   The place of the memory block.
   *
   * @note      If not exist, then allocation.
   */
  template <typename T>
  inline T* mutable_data(DDim dims, platform::Place place);
  /*! Resize the dimensions of the memory block. */
  inline Tensor& Resize(const DDim& dims);
  /*! Return the dimensions of the memory block. */
  inline const DDim& dims() const;
 private:
  /*! holds the memory block if allocated. */
  std::shared_ptr<Placeholder> holder_;
  /*! points to dimensions of memory block. */
  DDim dim_;
 };
 ```
 `Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
 ```cpp
 paddle::framework::Tensor t;
 paddle::platform::CPUPlace place;
 // set size first
 t.Resize({2, 3});
 // allocate memory on CPU later
 t.mutable_data(place);
 ```
 ### paddle::framework::Tensor Usage
 `AddOp` demonstrates Tensor's usage.
 - InferShape
 When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor.
 ```cpp
 void InferShape(const framework::InferShapeContext &ctx) const override {
  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                    ctx.Input<Tensor>("Y")->dims(),
                    "Two input of Add Op's dimension must be same.");
  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
 }
 ```
 - Run
 ```cpp
 void Compute(const framework::ExecutionContext& context) const override {
  auto* input0 = context.Input<Tensor>("X");
  auto* input1 = context.Input<Tensor>("Y");
  auto* output = context.Output<Tensor>("Out");
  output->mutable_data<T>(context.GetPlace());
  auto x = EigenVector<T>::Flatten(*input0);
  auto y = EigenVector<T>::Flatten(*input1);
  auto z = EigenVector<T>::Flatten(*output);
  auto place = context.GetEigenDevice<Place>();
  z.device(place) = x + y;
 }
 ```
 ### Transformation from paddle::framework::Tensor to EigenTensor
 As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
 Using EigenTensor as an example:
 ```cpp
 Tensor t;
 float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
 for (int i = 0; i < 1 * 2 * 3; i++) {
  p[i] = static_cast<float>(i);
 }
 EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
 ```
 `From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation.
 In Eigen, tensors with different ranks are different types, with `Vector` being a rank-1 instance. Note that `EigenVector<T>::From` uses a transformation from a 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector<T>::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
 For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
 ### Implementing Computation
 While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally in the Tensor and does not change any of the shape information associated with the Tensor.
 ```cpp
 auto x = EigenVector<T>::Flatten(*input0);
 auto y = EigenVector<T>::Flatten(*input1);
 auto z = EigenVector<T>::Flatten(*output);
 auto place = context.GetEigenDevice<Place>();
 z.device(place) = x + y;
 ```
 In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface.
 Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels).
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 bool isGpuVersion() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
  return false;
 #else
  return true;
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@ -46,7 +46,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
  paddle::real* buf = ptr->mat->getRowBuf(rowID);
  size_t width = ptr->mat->getWidth();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
  std::copy(rowArray, rowArray + width, buf);
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -19,16 +19,15 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
-cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator)
 cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
@ -42,3 +41,6 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
 cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
 cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@ -21,20 +21,12 @@ limitations under the License. */
 #include <vector>
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/type_defs.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/variant.h"
 namespace paddle {
 namespace framework {
 // The order should be as same as framework.proto
 typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                       std::vector<float>, std::vector<std::string>, bool,
                       std::vector<bool>, BlockDesc*>
    Attribute;
 typedef std::unordered_map<std::string, Attribute> AttributeMap;
 ProgramDesc& GetProgramDesc();
 template <typename T>
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -13,10 +13,13 @@
   limitations under the License. */
 #include "paddle/framework/backward.h"
 #include "paddle/operators/net_op.h"
 #include <deque>
 #include <list>
 #include <memory>
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@ -24,6 +27,35 @@
 namespace paddle {
 namespace framework {
 static inline std::unique_ptr<OperatorBase> CreateGradOp(
    const OperatorBase& op) {
  OpDescBind op_desc;
  op_desc.SetInputMap(op.Inputs());
  op_desc.SetOutputMap(op.Outputs());
  op_desc.SetType(op.Type());
  op_desc.SetAttrMap(op.Attrs());
  auto& info = OpInfoMap::Instance().Get(op.Type());
  auto grad_descs = info.GradOpMaker()(op_desc);
  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
  grad_ops.reserve(grad_descs.size());
  std::transform(grad_descs.begin(), grad_descs.end(),
                 std::back_inserter(grad_ops),
                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
                   return OpRegistry::CreateOp(*grad_desc);
                 });
  PADDLE_ENFORCE(!grad_ops.empty());
  if (grad_ops.size() == 1) {
    return std::move(grad_ops[0]);
  } else {
    auto net_op = new operators::NetOp();
    for (auto& grad_op : grad_ops) {
      net_op->AppendOp(std::move(grad_op));
    }
    net_op->CompleteAddOp();
    return std::unique_ptr<OperatorBase>(net_op);
  }
 }
 template <typename Map, typename T>
 static void ForEachVarName(const Map& names, T callback) {
  for (auto& name : names) {
@ -141,9 +173,26 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
        net->ops_[op_offset]->Rename(name, dup_outputs.back());
      }
      // collect all the offset to append `add` op for each alias
-      insert_position.push_back(
+      //
-          {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
+      // one variable is shared between multiple operators.
-                                               {{"Out", {name}}}, {})});
+      // insert add operator one by one, then add it to output
      for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
           ++output_idx) {
        auto insert_add_x = dup_outputs[output_idx];
        auto insert_add_y = dup_outputs[output_idx + 1];
        auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
        // first add op inserted
        if (output_idx == dup_outputs.size() - 2) {
          insert_add_out = name;
        }
        if (output_idx != 0) {
          insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
        }
        insert_position.push_back(
            {dup_op.back(),
             OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}},
                                  {{"Out", {insert_add_out}}}, {})});
      }
    }
    // make sure the inserted `add` ops follow the BFS order.
@ -154,7 +203,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
      net->InsertOp(pos.first + 1, std::move(pos.second));
    }
  } else {
-    std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
+    std::unique_ptr<OperatorBase> grad_op(CreateGradOp(forwardOp));
    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
                                          const std::string& grad_input) {
@ -182,7 +231,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
    // process recurrent gradient op as a special operator.
    if (forwardOp.Type() == "recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or
+      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
      // or
      // this will result in infinite loop.
      const auto& rnnop =
          *static_cast<const operators::RecurrentOp*>(&forwardOp);
@ -222,5 +272,145 @@ std::unique_ptr<OperatorBase> Backward(
  return BackwardRecursive(forwardOp, no_grad_names, uid);
 }
 // ====================================  //
 // Returns true iff, for every entry of `names`, the name produced by
 // GradVarName for that entry is contained in `set`. An empty `names` yields
 // true. Evaluation stops at the first missing name.
 static bool AllGradInSet(const std::vector<std::string>& names,
                          const std::unordered_set<std::string>& set) {
  return std::all_of(names.begin(), names.end(),
                     [&set](const std::string& var_name) {
                       return set.find(GradVarName(var_name)) != set.end();
                     });
 }
 std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
    const std::unique_ptr<OpDescBind>& op_desc,
    std::unordered_set<std::string>& no_grad_vars) {
  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
  // All input gradients of forwarding operator do not need to calculat.
  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
  if (AllGradInSet(inputs, no_grad_vars)) {
    return grad_op_descs;  // empty vector
  }
  // All output gradients of forwarding operator do not need to calculate.
  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
  if (AllGradInSet(outputs, no_grad_vars)) {
    for (const std::string& name : inputs) {
      no_grad_vars.insert(GradVarName(name));
    }
    return grad_op_descs;  // empty vector
  }
  grad_op_descs = OpRegistry::CreateGradOpDescs(*op_desc);
  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
  for (auto& desc : grad_op_descs) {
    for (const std::string& in_name : desc->InputArgumentNames()) {
      if (no_grad_vars.count(in_name)) {
        std::string prefix = in_name.substr(
            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
        std::string new_name = prefix + kZeroVarSuffix;
        desc->Rename(in_name, new_name);
        std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
            "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
      }
    }
    for (const std::string& out_name : desc->OutputArgumentNames()) {
      if (no_grad_vars.count(out_name)) {
        desc->Rename(out_name, kEmptyVarName);
      }
    }
  }
  for (auto& p : pending_fill_zeros_ops) {
    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
  }
  return grad_op_descs;
 }
 // Builds the backward (gradient) op descriptions for block `block_idx` of
 // `program_desc`.
 //
 // The block's ops are visited in reverse order and MakeOpGrad supplies the
 // gradient descriptions of each. For a "recurrent" op the step block is
 // processed recursively: its backward ops are stored in a newly appended
 // block, which is attached to the gradient op through the "step_block"
 // attribute. The step block index is read from that same "step_block"
 // attribute of the forward op.
 //
 // When more than one gradient op writes the same output variable, each writer
 // is renamed to `<name>@RENAME@<i>` and a "sum" op recombining them into
 // `<name>` is inserted right after the last writer.
 //
 // `no_grad_vars` is forwarded to MakeOpGrad, which may add names to it.
 // Returns the backward op descriptions in execution order.
 std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     ProgramDescBind& program_desc, int block_idx,
     std::unordered_set<std::string>& no_grad_vars) {
  BlockDescBind* cur_block = program_desc.Block(block_idx);
  std::deque<std::unique_ptr<OpDescBind>>& op_descs = cur_block->ops_;
  // Maps an output variable name to the indices (into backward_descs) of all
  // gradient ops that write it.
  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
  size_t grad_desc_idx = 0;
  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
    std::vector<std::unique_ptr<OpDescBind>> op_grads =
        MakeOpGrad(*it, no_grad_vars);
    if ((*it)->Type() == "recurrent") {
      PADDLE_ENFORCE_EQ(
          op_grads.size(), size_t(1),
          "rnn_op's gradient process should contain only one op.");
      int step_block_idx = (*it)->GetBlockAttr("step_block");
      auto backward_block_op_descs =
          MakeBlockBackward(program_desc, step_block_idx, no_grad_vars);
      BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
      for (auto& ptr : backward_block_op_descs) {
        backward_block->ops_.push_back(std::move(ptr));
      }
      op_grads[0]->SetBlockAttr("step_block", *backward_block);
    }
    for (const auto& desc : op_grads) {
      for (const std::string& out_name : desc->OutputArgumentNames()) {
        dup_out_ops[out_name].emplace_back(grad_desc_idx);
      }
      ++grad_desc_idx;
    }
    std::transform(
        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
  }
  // Check whether some variables are written more than once
  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
  for (const auto& dup : dup_out_ops) {
    const std::string& out_name = dup.first;
    // Bind by reference: the index list is only read here.
    const std::vector<size_t>& dup_op = dup.second;
    if (out_name != kEmptyVarName && dup_op.size() > 1) {
      std::vector<std::string> sum_op_inputs;
      sum_op_inputs.reserve(dup_op.size());
      for (size_t i = 0; i < dup_op.size(); ++i) {
        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
        backward_descs[dup_op[i]]->Rename(out_name, new_name);
        sum_op_inputs.emplace_back(new_name);
      }
      std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
          "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
    }
  }
  // Insert from the highest position downwards so that earlier insertion
  // points are not shifted by later insertions.
  pending_sum_ops.sort(
      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
        return a.first > b.first;
      });
  for (auto& p : pending_sum_ops) {
    backward_descs.insert(backward_descs.begin() + p.first + 1,
                          std::move(p.second));
  }
  return backward_descs;
 }
 // Appends the backward op descriptions of the root block (index 0) to that
 // block's op list.
 //
 // For every name in `no_grad_vars`, the name produced by GradVarName is
 // excluded from gradient computation; the gradient of the empty variable
 // (kEmptyVarName + kGradVarSuffix) is always excluded.
 //
 // NOTE(review): the ops are pushed into `ops_` directly, so the block's
 // `need_update_` flag is not touched here - confirm a later Sync() still
 // picks these ops up.
 void AppendBackward(ProgramDescBind& program_desc,
                     const std::unordered_set<std::string>& no_grad_vars) {
  const int root_block_idx = 0;

  std::unordered_set<std::string> excluded_grads;
  excluded_grads.reserve(no_grad_vars.size() + 1);
  excluded_grads.insert(std::string(kEmptyVarName) + kGradVarSuffix);
  for (auto& var_name : no_grad_vars) {
    excluded_grads.insert(GradVarName(var_name));
  }

  auto grad_descs =
      MakeBlockBackward(program_desc, root_block_idx, excluded_grads);

  auto& root_ops = program_desc.Block(root_block_idx)->ops_;
  for (auto& grad_desc : grad_descs) {
    root_ops.push_back(std::move(grad_desc));
  }
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@ -13,8 +13,11 @@
   limitations under the License. */
 #pragma once
 #include <unordered_set>
-#include "operator.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"
 namespace paddle {
 namespace framework {
@ -23,5 +26,9 @@ namespace framework {
 extern std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);
 void AppendBackward(ProgramDescBind& program_desc,
                    const std::unordered_set<std::string>& no_grad_vars);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@ -0,0 +1,93 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/program_desc.h"
 namespace paddle {
 namespace framework {
 // Creates a variable description called `name` in this block and returns a
 // non-owning pointer to it; the block keeps ownership. Enforces that no
 // variable with that name exists yet. The block is flagged as modified
 // before the check is made.
 VarDescBind *BlockDescBind::NewVar(const std::string &name) {
  need_update_ = true;
  const bool name_is_free = vars_.find(name) == vars_.end();
  PADDLE_ENFORCE(name_is_free, "Duplicated variable %s", name);
  VarDescBind *created = new VarDescBind(name);
  vars_[name].reset(created);
  return created;
 }
 // Looks up the variable description called `name` in this block and returns
 // a non-owning pointer to it. Enforces that the variable exists; parent
 // blocks are not searched.
 VarDescBind *BlockDescBind::Var(const std::string &name) const {
  const auto found = vars_.find(name);
  const bool exists = found != vars_.end();
  PADDLE_ENFORCE(exists, "Can not find variable %s in current block.", name);
  return found->second.get();
 }
 // Returns true iff this block holds a variable description called `name`.
 bool BlockDescBind::HasVar(const std::string &name) const {
  return vars_.count(name) != 0;
 }
 std::vector<VarDescBind *> BlockDescBind::AllVars() const {
  std::vector<VarDescBind *> res;
  for (const auto &p : vars_) {
    res.push_back(p.second.get());
  }
  return res;
 }
 // Appends a new, default-constructed op description to the end of this block
 // and returns a non-owning pointer to it. Flags the block as modified.
 OpDescBind *BlockDescBind::AppendOp() {
  need_update_ = true;
  // Hold the new op in a unique_ptr before touching the deque so that it is
  // not leaked if the deque fails to grow.
  std::unique_ptr<OpDescBind> op(new OpDescBind());
  ops_.push_back(std::move(op));
  return ops_.back().get();
 }
 // Inserts a new, default-constructed op description at the front of this
 // block and returns a non-owning pointer to it. Flags the block as modified.
 OpDescBind *BlockDescBind::PrependOp() {
  need_update_ = true;
  // Hold the new op in a unique_ptr before touching the deque so that it is
  // not leaked if the deque fails to grow.
  std::unique_ptr<OpDescBind> op(new OpDescBind());
  ops_.push_front(std::move(op));
  return ops_.front().get();
 }
 std::vector<OpDescBind *> BlockDescBind::AllOps() const {
  std::vector<OpDescBind *> res;
  for (const auto &op : ops_) {
    res.push_back(op.get());
  }
  return res;
 }
 // Rebuilds the `ops` field of the underlying BlockDesc message from `ops_`.
 // Does nothing unless the block has been flagged as modified; clears the
 // flag afterwards. Only ops are written here; `vars_` is not touched.
 //
 // NOTE(review): AddAllocated hands each pointer returned by Proto() to the
 // repeated field. Confirm who owns those messages, since `ops_` also owns
 // the OpDescBind objects they come from.
 void BlockDescBind::Sync() {
  if (!need_update_) {
    return;
  }
  auto &pb_ops = *this->desc_->mutable_ops();
  pb_ops.Clear();
  pb_ops.Reserve(static_cast<int>(ops_.size()));
  for (auto &op : ops_) {
    pb_ops.AddAllocated(op->Proto());
  }
  need_update_ = false;
 }
 // Returns the parent block of this block, looked up in the owning program by
 // the parent index stored in the BlockDesc, or nullptr when that index is -1.
 BlockDescBind *BlockDescBind::ParentBlock() const {
  const auto parent_idx = this->desc_->parent_idx();
  return parent_idx == -1 ? nullptr
                          : prog_->Block(static_cast<size_t>(parent_idx));
 }
 // Stores the raw BlockDesc pointer of `block` as attribute `name` of this op.
 // The pointer is not owned by the attribute map.
 void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
  this->attrs_[name] = block.RawPtr();
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@ -0,0 +1,81 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <deque>
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/var_desc.h"
 #include "paddle/platform/macros.h"
 namespace paddle {
 namespace framework {
 class ProgramDescBind;
 // For each Protobuf message we provide an XXXBind class. The Bind class keeps
 // its own in-memory state so that reads and writes are fast; local changes
 // are written back to the protobuf message only when it is requested (by the
 // `Sync` method).
 //
 // BlockDescBind wraps one BlockDesc message. It owns the op and variable
 // descriptions of the block, but neither the BlockDesc itself nor the
 // enclosing program.
 class BlockDescBind {
 public:
  // The backward-construction routines read and append to `ops_` directly.
  friend std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
      ProgramDescBind &program_desc, int block_idx,
      std::unordered_set<std::string> &no_grad_vars);
  friend void AppendBackward(
      ProgramDescBind &program_desc,
      const std::unordered_set<std::string> &no_grad_vars);
  // Wraps `desc`, which belongs to program `prog`. Neither pointer is owned.
  // The block starts out unmodified (need_update_ == false).
  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
      : prog_(prog), desc_(desc), need_update_(false) {}
  // Index of this block, as stored in the BlockDesc.
  int32_t ID() const { return desc_->idx(); }
  // Index of the parent block, as stored in the BlockDesc.
  int32_t Parent() const { return desc_->parent_idx(); }
  // Creates a new variable; enforces that the name is not used yet.
  VarDescBind *NewVar(const std::string &name_bytes);
  // Returns an existing variable; enforces that it exists in this block.
  VarDescBind *Var(const std::string &name_bytes) const;
  // Returns true iff a variable called `var_name` exists in this block.
  bool HasVar(const std::string &var_name) const;
  // Returns non-owning pointers to all variables of this block.
  std::vector<VarDescBind *> AllVars() const;
  // Returns the parent block, or nullptr when the parent index is -1.
  BlockDescBind *ParentBlock() const;
  // Adds a new, empty op at the end of the block and returns it.
  OpDescBind *AppendOp();
  // Adds a new, empty op at the front of the block and returns it.
  OpDescBind *PrependOp();
  // Returns non-owning pointers to all ops of this block, in order.
  std::vector<OpDescBind *> AllOps() const;
  // Writes the ops back into the BlockDesc if the block was modified.
  void Sync();
  // Returns the wrapped BlockDesc pointer.
  BlockDesc *RawPtr() { return desc_; }
 private:
  ProgramDescBind *prog_;  // not owned
  BlockDesc *desc_;        // not owned
  // Set when ops or variables are added; cleared by Sync().
  bool need_update_;
  // Op descriptions of this block, in execution order.
  std::deque<std::unique_ptr<OpDescBind>> ops_;
  // Variable descriptions of this block, keyed by variable name.
  std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
  DISABLE_COPY_AND_ASSIGN(BlockDescBind);
 };
 }  // namespace framework
 }  // namespace paddle
--- a/Show More
+++ b/Show More