merge develop

7 years ago · c93d74aa06
parent 6e7666f199 c3b46d1683
commit c93d74aa06
32 changed files with 1418 additions and 622 deletions
--- a/doc/design/block.md
+++ b/doc/design/block.md
@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples.
 The following C++ programs shows how blocks are used with the `if-else` structure:

 ```c++
+namespace pd = paddle;
+
 int x = 10;
-int y = 20;
-int out;
+int y = 1;
+int z = 10;
 bool cond = false;
+int o1, o2;
 if (cond) {
  int z = x + y;
-  out = softmax(z);
+  o1 = z;
+  o2 = pd::layer::softmax(z);
 } else {
-  int z = fc(x);
-  out = z;
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
 }
+
 ```

 An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator
 ```python
 import paddle as pd

-x = var(10)
-y = var(20)
-cond = var(false)
-ie = pd.create_ifelseop(inputs=[x], output_num=1)
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
 with ie.true_block():
-    x = ie.inputs(true, 0)
-    z = operator.add(x, y)
-    ie.set_output(true, 0, operator.softmax(z))
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
 with ie.false_block():
-    x = ie.inputs(false, 0)
-    z = layer.fc(x)
-    ie.set_output(true, 0, operator.softmax(z))
-out = b(cond)
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
 ```

-In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `x+1` and `fc(x)`.

 A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.  The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.

+
 ### Blocks with `for` and `RNNOp`

 The following RNN model from the [RNN design doc](./rnn.md)

 ```python
-x = sequence([10, 20, 30])
-m = var(0)
-W = tensor()
-U = tensor()
-
-rnn = create_rnn(inputs=[input])
-with rnn.stepnet() as net:
-  x = net.set_inputs(0)
-  h = net.add_memory(init=m)
-  fc_out = pd.matmul(W, x)
-  hidden_out = pd.matmul(U, h.pre(n=1))
-  sum = pd.add_two(fc_out, hidden_out)
-  act = pd.sigmoid(sum)
-  h.update(act)                       # update memory with act
-  net.set_outputs(0, act, hidden_out) # two outputs
-
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+
+rnn = pd.rnn()
+with rnn.step():
+  h = rnn.memory(init = m)
+  hh = rnn.previous_memory(h)
+  a = layer.fc(W, x)
+  b = layer.fc(U, hh)  
+  s = pd.add(a, b)
+  act = pd.sigmoid(s)
+  rnn.update_memory(h, act)
+  rnn.output(a, b)
 o1, o2 = rnn()
-print o1, o2
 ```
-
 has its equivalent C++ program as follows

 ```c++
 int* x = {10, 20, 30};
-int m = 0;
-int W = some_value();
-int U = some_other_value();
+int* m = {0};
+int* W = {0.314};
+int* U = {0.375};

 int mem[sizeof(x) / sizeof(x[0]) + 1];
 int o1[sizeof(x) / sizeof(x[0]) + 1];
@ -131,20 +135,16 @@ int o2[sizeof(x) / sizeof(x[0]) + 1];
 for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
  int x = x[i-1];
  if (i == 1) mem[0] = m;
-  int fc_out = W * x;
-  int hidden_out = Y * mem[i-1];
-  int sum = fc_out + hidden_out;
+  int a = W * x;
+  int b = Y * mem[i-1];
+  int s = fc_out + hidden_out;
  int act = sigmoid(sum);
  mem[i] = act;
  o1[i] = act;
  o2[i] = hidden_out;
 }
-
-print_array(o1);
-print_array(o2);
 ```

-
 ## Compilation and Execution

 Like TensorFlow programs, a PaddlePaddle program is written in Python.  The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference.
@ -210,11 +210,11 @@ a = pd.Varaible(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])

 rnn = pd.create_rnn()
-with rnn.stepnet() as net:
-    x = net.set_inputs(a)
+with rnn.stepnet()
+    x = a.as_step_input()
    # reuse fc's parameter
    fc_without_b = pd.get_variable("fc.w")
-    net.set_outputs(fc_without_b)
+    rnn.output(fc_without_b)

 out = rnn()
 ```
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
@ -1,41 +1,51 @@
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack().
+# The `IfElse` Operator

-```python
-import paddle as pd
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:

-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1)
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-
-with b.false_block():
-    x = b.inputs(0)
-    z = layer.fc(x)
-    b.set_output(0, operator.softmax(z))
-
-out = b(cond)
-```
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:

-If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd

-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
-
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```

-out = b(cond)
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
 ```
-where default_value is a list of vars for `cond` == False.
--- a/doc/design/program.md
+++ b/doc/design/program.md
@ -1,8 +1,10 @@
-# Design Doc: ProgramDesc
+# Design Doc: PaddlePaddle Programs

-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.

-As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):

 ```python
 x = layer.data("images")
@ -13,36 +15,112 @@ optimize(cost)
 train(cost, reader=mnist.train())
 ```

-generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message:
+The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message.  The last line runs it.

-```protobuf
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
  }
+  return 0;
+}
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd  // block 0
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]

+ie = pd.ifelse()
+with ie.true_block():  // block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  // block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
 message BlockDesc {
  required int32 parent = 1;
  repeated VarDesc vars = 2;
  repeated OpDesc ops = 3;
 }
+```
+
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+
+
+### Global Block
+
+The global block is the first one in the above array.

+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
 message OpDesc {
  AttrDesc attrs = 1;
  ...
 }
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:

+```
 message AttrDesc {
-  required AttrType type = 1;
+  required string name = 1;

-  // index into ProgramDesc::blocks when type==BLOCK
-  optional int32 block = 2;
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
    ...
+    BLOCK = ...
  }
-```
+  required AttrType type = 2;

-When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions.  This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks.  This requires that we can trace the parent of a block.
+  optional int32 block = 10; // when type == BLOCK
+  ...
+}
+```

-A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp.  In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`.  So that `AttrDesc::block` could be an integer block ID.
+## InferShape

 With this design, the InferShape function should take the following parameters:

--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -26,10 +26,8 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)

-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator proto_desc)
-cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry sum_op)

 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -13,10 +13,13 @@
   limitations under the License. */

 #include "paddle/framework/backward.h"
+#include "paddle/operators/net_op.h"

+#include <deque>
 #include <list>
 #include <memory>

+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@ -24,6 +27,35 @@
 namespace paddle {
 namespace framework {

+static inline std::unique_ptr<OperatorBase> CreateGradOp(
+    const OperatorBase& op) {
+  OpDescBind op_desc;
+  op_desc.SetInputMap(op.Inputs());
+  op_desc.SetOutputMap(op.Outputs());
+  op_desc.SetType(op.Type());
+  op_desc.SetAttrMap(op.Attrs());
+  auto& info = OpInfoMap::Instance().Get(op.Type());
+  auto grad_descs = info.GradOpMaker()(op_desc);
+  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
+  grad_ops.reserve(grad_descs.size());
+  std::transform(grad_descs.begin(), grad_descs.end(),
+                 std::back_inserter(grad_ops),
+                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
+                   return OpRegistry::CreateOp(*grad_desc);
+                 });
+  PADDLE_ENFORCE(!grad_ops.empty());
+  if (grad_ops.size() == 1) {
+    return std::move(grad_ops[0]);
+  } else {
+    auto net_op = new operators::NetOp();
+    for (auto& grad_op : grad_ops) {
+      net_op->AppendOp(std::move(grad_op));
+    }
+    net_op->CompleteAddOp();
+    return std::unique_ptr<OperatorBase>(net_op);
+  }
+}
+
 template <typename Map, typename T>
 static void ForEachVarName(const Map& names, T callback) {
  for (auto& name : names) {
@ -171,7 +203,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
      net->InsertOp(pos.first + 1, std::move(pos.second));
    }
  } else {
-    std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
+    std::unique_ptr<OperatorBase> grad_op(CreateGradOp(forwardOp));

    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
                                          const std::string& grad_input) {
@ -240,5 +272,145 @@ std::unique_ptr<OperatorBase> Backward(
  return BackwardRecursive(forwardOp, no_grad_names, uid);
 }

+// ====================================  //
+
+static bool AllGradInSet(const std::vector<std::string>& names,
+                         const std::unordered_set<std::string>& set) {
+  for (const std::string& name : names) {
+    if (!set.count(GradVarName(name))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
+    const std::unique_ptr<OpDescBind>& op_desc,
+    std::unordered_set<std::string>& no_grad_vars) {
+  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
+  // All input gradients of forwarding operator do not need to calculat.
+  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
+  if (AllGradInSet(inputs, no_grad_vars)) {
+    return grad_op_descs;  // empty vector
+  }
+  // All output gradients of forwarding operator do not need to calculate.
+  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
+  if (AllGradInSet(outputs, no_grad_vars)) {
+    for (const std::string& name : inputs) {
+      no_grad_vars.insert(GradVarName(name));
+    }
+    return grad_op_descs;  // empty vector
+  }
+
+  grad_op_descs = OpRegistry::CreateGradOpDescs(*op_desc);
+
+  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
+  for (auto& desc : grad_op_descs) {
+    for (const std::string& in_name : desc->InputArgumentNames()) {
+      if (no_grad_vars.count(in_name)) {
+        std::string prefix = in_name.substr(
+            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
+        std::string new_name = prefix + kZeroVarSuffix;
+        desc->Rename(in_name, new_name);
+        std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
+            "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
+        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
+      }
+    }
+    for (const std::string& out_name : desc->OutputArgumentNames()) {
+      if (no_grad_vars.count(out_name)) {
+        desc->Rename(out_name, kEmptyVarName);
+      }
+    }
+  }
+
+  for (auto& p : pending_fill_zeros_ops) {
+    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
+  }
+  return grad_op_descs;
+}
+
+std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
+    ProgramDescBind& program_desc, int block_idx,
+    std::unordered_set<std::string>& no_grad_vars) {
+  BlockDescBind* cur_block = program_desc.Block(block_idx);
+  std::deque<std::unique_ptr<OpDescBind>>& op_descs = cur_block->ops_;
+  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
+  size_t grad_desc_idx = 0;
+  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
+  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+    std::vector<std::unique_ptr<OpDescBind>> op_grads =
+        MakeOpGrad(*it, no_grad_vars);
+
+    if ((*it)->Type() == "recurrent") {
+      PADDLE_ENFORCE_EQ(
+          op_grads.size(), size_t(1),
+          "rnn_op's gradient process should contain only one op.");
+      int step_block_idx = (*it)->GetBlockAttr("stop_block");
+      auto backward_block_op_descs =
+          MakeBlockBackward(program_desc, step_block_idx, no_grad_vars);
+      BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
+      for (auto& ptr : backward_block_op_descs) {
+        backward_block->ops_.push_back(std::move(ptr));
+      }
+      op_grads[0]->SetBlockAttr("step_block", *backward_block);
+    }
+
+    for (const auto& desc : op_grads) {
+      for (const std::string& out_name : desc->OutputArgumentNames()) {
+        dup_out_ops[out_name].emplace_back(grad_desc_idx);
+      }
+      ++grad_desc_idx;
+    }
+    std::transform(
+        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
+        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
+  }
+  // Check whether some variables are written more than once
+  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
+  for (const auto& dup : dup_out_ops) {
+    const std::string& out_name = dup.first;
+    const std::vector<size_t> dup_op = dup.second;
+    if (out_name != kEmptyVarName && dup_op.size() > 1) {
+      std::vector<std::string> sum_op_inputs;
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
+        backward_descs[dup_op[i]]->Rename(out_name, new_name);
+        sum_op_inputs.emplace_back(new_name);
+      }
+      std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
+          "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
+      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
+    }
+  }
+  pending_sum_ops.sort(
+      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
+         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
+        return a.first > b.first;
+      });
+  for (auto& p : pending_sum_ops) {
+    backward_descs.insert(backward_descs.begin() + p.first + 1,
+                          std::move(p.second));
+  }
+  return backward_descs;
+}
+
+void AppendBackward(ProgramDescBind& program_desc,
+                    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_var_names;
+  no_grad_var_names.reserve(no_grad_vars.size() + 1);
+  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
+  for (auto& name : no_grad_vars) {
+    no_grad_var_names.insert(GradVarName(name));
+  }
+  const int root_block_idx = 0;
+  auto backward_op_descs =
+      MakeBlockBackward(program_desc, root_block_idx, no_grad_var_names);
+  auto& forw_op_descs = program_desc.Block(root_block_idx)->ops_;
+  for (auto& ptr : backward_op_descs) {
+    forw_op_descs.push_back(std::move(ptr));
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@ -13,8 +13,11 @@
   limitations under the License. */

 #pragma once
+
 #include <unordered_set>
-#include "operator.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
 namespace paddle {
 namespace framework {

@ -23,5 +26,9 @@ namespace framework {
 extern std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);
+
+void AppendBackward(ProgramDescBind& program_desc,
+                    const std::unordered_set<std::string>& no_grad_vars);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@ -32,6 +32,14 @@ class ProgramDescBind;

 class BlockDescBind {
 public:
+  friend std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
+      ProgramDescBind &program_desc, int block_idx,
+      std::unordered_set<std::string> &no_grad_vars);
+
+  friend void AppendBackward(
+      ProgramDescBind &program_desc,
+      const std::unordered_set<std::string> &no_grad_vars);
+
  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
      : prog_(prog), desc_(desc), need_update_(false) {}

--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@ -66,7 +66,6 @@ message OpProto {

    optional bool duplicable = 3 [ default = false ];
    optional bool intermediate = 4 [ default = false ];
-    optional bool not_in_gradient = 5 [ default = false ];
  }

  // AttrProto describes the C++ type Attribute.
@ -116,4 +115,7 @@ message BlockDesc {
  repeated OpDesc ops = 4;
 }

+// Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
+// for more details.
 message ProgramDesc { repeated BlockDesc blocks = 1; }
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOpArgType::OUT WARRANTIES OR CONDITIONS OF ANY KOpArgType::IND, either
-express or implied. See the License for the specific language governing
-permissions and limitations under the License. */
-
-#include "paddle/framework/grad_op_builder.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace framework {
-enum class OpArgType { IN, OUT };
-
-static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, VariableNameMap* vars) {
-  const auto& src_inout =
-      src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
-  auto& dst_inout = *vars;
-  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
-  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
-  for (const auto& arg : src_arg_list) {
-    if (arg.not_in_gradient() && !is_grad) continue;
-    const std::string src_name = arg.name();
-    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
-    dst_inout[dst_name].reserve(src_inout.at(src_name).size());
-    for (auto& var_name : src_inout.at(src_name)) {
-      std::string s = is_grad ? GradVarName(var_name) : var_name;
-      dst_inout[dst_name].emplace_back(s);
-    }
-  }
-}
-
-OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto& info = OpInfoMap::Instance().Get(op->Type());
-  PADDLE_ENFORCE(info.HasGradientOp());
-
-  VariableNameMap inputs;
-  VariableNameMap outputs;
-  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
-  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
-  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
-  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG
-
-  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
-  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
-}
-
-static void TransOpDescArg(const OpDescBind* src_op, const OpArgType& src_type,
-                           bool is_grad, OpDescBind* dst_op,
-                           const OpArgType& dst_type) {
-  PADDLE_ENFORCE(dst_op != nullptr,
-                 "Protobuf desc of gradient op must be initialized first.");
-  const auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
-  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
-  for (const auto& arg : src_arg_list) {
-    if (arg.not_in_gradient() && !is_grad) continue;
-    const std::string src_name = arg.name();
-    std::vector<std::string> vars = src_type == OpArgType::IN
-                                        ? src_op->Input(src_name)
-                                        : src_op->Output(src_name);
-    if (is_grad) {
-      for (std::string& var : vars) {
-        var = GradVarName(var);
-      }
-    }
-    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
-    dst_type == OpArgType::IN ? dst_op->SetInput(dst_name, vars)
-                              : dst_op->SetOutput(dst_name, vars);
-  }
-}
-
-void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op) {
-  auto& info = OpInfoMap::Instance().Get(forw_op->Type());
-  PADDLE_ENFORCE(info.HasGradientOp());
-
-  grad_op->SetType(info.grad_op_type_);
-
-  TransOpDescArg(forw_op, OpArgType::IN, false, grad_op, OpArgType::IN);
-  TransOpDescArg(forw_op, OpArgType::OUT, false, grad_op, OpArgType::IN);
-  TransOpDescArg(forw_op, OpArgType::OUT, true, grad_op, OpArgType::IN);
-  TransOpDescArg(forw_op, OpArgType::IN, true, grad_op, OpArgType::OUT);
-
-  grad_op->SetAttrMap(forw_op->GetAttrMap());
-}
-
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/operator.h"
-
-namespace paddle {
-namespace framework {
-
-OperatorBase* BuildGradOp(const OperatorBase* op);
-
-void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op);
-
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@ -1,186 +0,0 @@
-#include "paddle/framework/grad_op_builder.h"
-#include <gtest/gtest.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-
-USE_OP(sum);
-
-namespace paddle {
-namespace framework {
-
-class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable();
-    AddInput("In3", "another single input");
-    AddOutput("Out1", "a single output");
-    AddOutput("Out2_mult", "a multiple output").AsDuplicable();
-    AddComment("test op with multiple inputs and outputs");
-  }
-};
-
-class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
- public:
-  IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable().NotInGradient();
-    AddInput("In3_mult", "another multiple input").AsDuplicable();
-    AddOutput("Out1_mult", "a multiple output").AsDuplicable();
-    AddOutput("Out2", "a single output").NotInGradient();
-    AddComment("op with inputs and outputs ignored in gradient calculating");
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-namespace f = paddle::framework;
-
-REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
-REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
-
-TEST(GradOpBuilder, MutiInOut) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "mult_io", {{"In1", {"in1"}},
-                  {"In2_mult", {"in2_1", "in2_2", "in2_3"}},
-                  {"In3", {"in3"}}},
-      {{"Out1", {"out1"}}, {"Out2_mult", {"out2_1", "out2_2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-
-  ASSERT_EQ(grad_test_op->Inputs().size(), 3UL + 2UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
-            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
-  EXPECT_EQ(grad_test_op->Input("In3"), "in3");
-  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
-  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
-            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
-            f::GradVarName("out1"));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
-
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>({f::GradVarName("in2_1"),
-                                      f::GradVarName("in2_2"),
-                                      f::GradVarName("in2_3")}));
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
-}
-
-TEST(GradOpBuilder, IOIgnoredInGradient) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "io_ignored", {{"In1", {"in1"}},
-                     {"In2_mult", {"in2_1", "in2_2"}},
-                     {"In3_mult", {"in3_1", "in3_2"}}},
-      {{"Out1_mult", {"out1_1", "out1_2"}}, {"Out2", {"out2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-
-  // 'In2' and 'Out2' are ignored in gradient calculating
-  ASSERT_EQ(grad_test_op->Inputs().size(), 2UL + 1UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
-            std::vector<std::string>({"in3_1", "in3_2"}));
-  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
-            std::vector<std::string>({"out1_1", "out1_2"}));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
-            f::GradVarName("out2"));
-
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
-}
-
-TEST(GradOpDescBuilder, MutiInOut) {
-  f::OpDescBind *forw_op = new f::OpDescBind();
-  forw_op->SetType("mult_io");
-  forw_op->SetInput("In1", {"in1"});
-  forw_op->SetInput("In2_mult", {"in2_1", "in2_2", "in2_3"});
-  forw_op->SetInput("In3", {"in3"});
-  forw_op->SetOutput("Out1", {"out1"});
-  forw_op->SetOutput("Out2_mult", {"out2_1", "out2_2"});
-
-  f::OpDescBind *grad_op = new f::OpDescBind();
-  f::CompleteGradOpDesc(forw_op, grad_op);
-
-  EXPECT_EQ(grad_op->Type(), "mult_io_grad");
-  ASSERT_EQ(grad_op->InputNames().size(), 3UL + 2UL + 2UL);
-  EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
-  EXPECT_EQ(grad_op->Input("In2_mult"),
-            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
-  EXPECT_EQ(grad_op->Input("In3"), std::vector<std::string>({"in3"}));
-  EXPECT_EQ(grad_op->Input("Out1"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op->Input("Out2_mult"),
-            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out1")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
-
-  ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
-            std::vector<std::string>({f::GradVarName("in1")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
-            std::vector<std::string>({f::GradVarName("in2_1"),
-                                      f::GradVarName("in2_2"),
-                                      f::GradVarName("in2_3")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In3")),
-            std::vector<std::string>({f::GradVarName("in3")}));
-  delete forw_op;
-  delete grad_op;
-}
-
-TEST(GradOpDescBuilder, IOIgnoredInGradient) {
-  f::OpDescBind *forw_op = new f::OpDescBind();
-  forw_op->SetType("io_ignored");
-  forw_op->SetInput("In1", {"in1"});
-  forw_op->SetInput("In2_mult", {"in2_1", "in2_2"});
-  forw_op->SetInput("In3_mult", {"in3_1", "in3_2"});
-  forw_op->SetOutput("Out1_mult", {"out1_1", "out1_2"});
-  forw_op->SetOutput("Out2", {"out2"});
-
-  f::OpDescBind *grad_op = new f::OpDescBind();
-  f::CompleteGradOpDesc(forw_op, grad_op);
-
-  EXPECT_EQ(grad_op->Type(), "io_ignored_grad");
-  // 'In2' and 'Out2' are ignored in gradient calculating
-  ASSERT_EQ(grad_op->InputNames().size(), 2UL + 1UL + 2UL);
-  EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
-  EXPECT_EQ(grad_op->Input("In3_mult"),
-            std::vector<std::string>({"in3_1", "in3_2"}));
-  EXPECT_EQ(grad_op->Input("Out1_mult"),
-            std::vector<std::string>({"out1_1", "out1_2"}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out1_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out2")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-
-  ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
-            std::vector<std::string>({f::GradVarName("in1")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("In3_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
-  delete forw_op;
-  delete grad_op;
-}
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@ -18,6 +18,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const AttributeMap &attrs) {
+  op_desc_.set_type(type);
+  inputs_ = inputs;
+  outputs_ = outputs;
+  attrs_ = attrs;
+}
+
 OpDesc *OpDescBind::Proto() {
  Sync();
  return &op_desc_;
@ -31,6 +40,14 @@ const std::vector<std::string> &OpDescBind::Input(
  return it->second;
 }

+std::vector<std::string> OpDescBind::InputArgumentNames() const {
+  std::vector<std::string> retv;
+  for (auto &ipt : this->inputs_) {
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
+  }
+  return retv;
+}
+
 void OpDescBind::SetInput(const std::string &param_name,
                          const std::vector<std::string> &args) {
  need_update_ = true;
@ -45,6 +62,14 @@ const std::vector<std::string> &OpDescBind::Output(
  return it->second;
 }

+std::vector<std::string> OpDescBind::OutputArgumentNames() const {
+  std::vector<std::string> retv;
+  for (auto &ipt : this->outputs_) {
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
+  }
+  return retv;
+}
+
 void OpDescBind::SetOutput(const std::string &param_name,
                           const std::vector<std::string> &args) {
  need_update_ = true;
@ -94,6 +119,18 @@ const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
  return attrs_;
 }

+void OpDescBind::Rename(const std::string &old_name,
+                        const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
 struct SetAttrDescVisitor : public boost::static_visitor<void> {
  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
  mutable OpDesc::Attr *attr_;
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@ -27,6 +27,11 @@ class BlockDescBind;

 class OpDescBind {
 public:
+  OpDescBind() {}
+
+  OpDescBind(const std::string &type, const VariableNameMap &inputs,
+             const VariableNameMap &outputs, const AttributeMap &attrs);
+
  OpDesc *Proto();

  std::string Type() const { return op_desc_.type(); }
@ -35,11 +40,15 @@ class OpDescBind {

  const std::vector<std::string> &Input(const std::string &name) const;

+  std::vector<std::string> InputArgumentNames() const;
+
  void SetInput(const std::string &param_name,
                const std::vector<std::string> &args);

  const std::vector<std::string> &Output(const std::string &name) const;

+  std::vector<std::string> OutputArgumentNames() const;
+
  void SetOutput(const std::string &param_name,
                 const std::vector<std::string> &args);

@ -61,6 +70,8 @@ class OpDescBind {

  int GetBlockAttr(const std::string &name) const;

+  void Rename(const std::string &old_name, const std::string &new_name);
+
  // Only be used in C++
  const AttributeMap &GetAttrMap() const;

@ -70,6 +81,22 @@ class OpDescBind {
  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }

+  void SetInputMap(const VariableNameMap &input) {
+    this->inputs_ = input;
+    this->need_update_ = true;
+  }
+
+  void SetOutputMap(const VariableNameMap &output) {
+    this->outputs_ = output;
+    this->need_update_ = true;
+  }
+
+  void Sync();
+
+  const VariableNameMap &Inputs() const { return inputs_; }
+
+  const VariableNameMap &Outputs() const { return outputs_; }
+
 private:
  template <typename MapType>
  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
@ -81,8 +108,6 @@ class OpDescBind {
    return ret_val;
  }

-  void Sync();
-
  OpDesc op_desc_;
  VariableNameMap inputs_;
  VariableNameMap outputs_;
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@ -17,6 +17,7 @@
 #include <map>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/type_defs.h"
@ -27,7 +28,6 @@ namespace framework {

 struct OpInfo {
  OpCreator creator_;
-  std::string grad_op_type_;
  GradOpMakerFN grad_op_maker_;
  OpProto* proto_{nullptr};
  OpAttrChecker* checker_{nullptr};
@ -43,19 +43,19 @@ struct OpInfo {
    return *proto_;
  }

-  const OpAttrChecker& Checker() const {
-    PADDLE_ENFORCE_NOT_NULL(checker_,
-                            "Operator Checker has not been registered");
-    return *checker_;
-  }
-
  const OpCreator& Creator() const {
    PADDLE_ENFORCE_NOT_NULL(creator_,
                            "Operator Creator has not been registered");
    return creator_;
  }

-  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+  const GradOpMakerFN& GradOpMaker() const {
+    PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
+                            "Operator GradOpMaker has not been registered.");
+    return grad_op_maker_;
+  }
+
+  const OpAttrChecker* Checker() const { return checker_; }
 };

 class OpInfoMap {
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@ -44,11 +44,6 @@ class OpProtoAndCheckerMaker {
      var_->set_intermediate(true);
      return *this;
    }
-
-    VariableBuilder& NotInGradient() {
-      var_->set_not_in_gradient(true);
-      return *this;
-    }
  };

  VariableBuilder AddInput(const std::string& name, const std::string& comment);
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@ -23,7 +23,9 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
    const std::string& type, const VariableNameMap& inputs,
    const VariableNameMap& outputs, AttributeMap attrs) {
  auto& info = OpInfoMap::Instance().Get(type);
-  info.Checker().Check(attrs);
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(attrs);
+  }
  auto op = info.Creator()(type, inputs, outputs, attrs);
  return std::unique_ptr<OperatorBase>(op);
 }
@ -52,9 +54,15 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
  return CreateOp(op_desc.type(), inputs, outputs, attrs);
 }

-std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
-  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
-  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
+  return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
+                  op_desc.GetAttrMap());
+}
+
+std::vector<std::unique_ptr<OpDescBind>> OpRegistry::CreateGradOpDescs(
+    const OpDescBind& op_desc) {
+  auto& info = OpInfoMap::Instance().Get(op_desc.Type());
+  return info.grad_op_maker_(op_desc);
 }

 }  // namespace framework
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@ -23,25 +23,37 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/details/op_registry.h"
 #include "paddle/framework/framework.pb.h"
-#include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"

 namespace paddle {
 namespace framework {
+class Registrar {
+ public:
+  // In our design, various kinds of classes, e.g., operators and kernels,
+  // have their corresponding registry and registrar. The action of
+  // registration is in the constructor of a global registrar variable, which,
+  // however, are not used in the code that calls package framework, and would
+  // be removed from the generated binary file by the linker. To avoid such
+  // removal, we add Touch to all registrar classes and make USE_OP macros to
+  // call this method. So, as long as the callee code calls USE_OP, the global
+  // registrar variable won't be removed by the linker.
+  void Touch() {}
+};

 template <typename... ARGS>
-struct OperatorRegistrar {
+struct OperatorRegistrar : public Registrar {
  explicit OperatorRegistrar(const char* op_type) : op_type(op_type) {
    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
                   "'%s' is registered more than once.", op_type);
    static_assert(sizeof...(ARGS) != 0,
                  "OperatorRegistrar should be invoked at least by OpClass");
    details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info);
+    OpInfoMap::Instance().Insert(op_type, info);
  }

-  ~OperatorRegistrar() { OpInfoMap::Instance().Insert(op_type, info); }
-
  const char* op_type;

  OpInfo info;
@ -67,20 +79,10 @@ class OpRegistry {

  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);

-  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
-};
+  static std::vector<std::unique_ptr<OpDescBind>> CreateGradOpDescs(
+      const OpDescBind& op_desc);

-class Registrar {
- public:
-  // In our design, various kinds of classes, e.g., operators and kernels,
-  // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which,
-  // however, are not used in the code that calls package framework, and would
-  // be removed from the generated binary file by the linker. To avoid such
-  // removal, we add Touch to all registrar classes and make USE_OP macros to
-  // call this method. So, as long as the callee code calls USE_OP, the global
-  // registrar variable won't be removed by the linker.
-  void Touch() {}
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
 };

 template <typename OpType, typename ProtoMakerType, typename GradOpType>
@ -138,33 +140,41 @@ class OpKernelRegistrar : public Registrar {
                             __test_global_namespace_##uniq_name##__>::value, \
                msg)

-/**
- * Macro to register Operator.
- */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,          \
-                    grad_op_class)                                            \
+#define REGISTER_OPERATOR(op_type, op_class, ...)                      \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
-      __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
+      __reg_op__##op_type,                                             \
+      "REGISTER_OPERATOR must be called in global namespace");         \
  class _OpClass_##op_type##_ : public op_class {                      \
   public:                                                             \
    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                     \
    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);            \
  };                                                                   \
-  class _OpGradClass_##op_type##_ : public grad_op_class {                    \
-   public:                                                                    \
-    DEFINE_OP_CLONE_METHOD(_OpGradClass_##op_type##_);                        \
-    DEFINE_OP_CONSTRUCTOR(_OpGradClass_##op_type##_, grad_op_class);          \
-  };                                                                          \
-  static ::paddle::framework::OpRegistrar<                                    \
-      _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_>       \
-      __op_registrar_##op_type##__(#op_type, #grad_op_type);                  \
+  static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \
+                                                ##__VA_ARGS__>         \
+      __op_registrar_##op_type##__(#op_type);                          \
  int TouchOpRegistrar_##op_type() {                                   \
    __op_registrar_##op_type##__.Touch();                              \
    return 0;                                                          \
  }

+/**
+ * Macro to register Operator.
+ */
+#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,           \
+                    grad_op_class)                                             \
+  REGISTER_OPERATOR(grad_op_type, grad_op_class);                              \
+  class _GradOpDescMaker_##grad_op_type##_                                     \
+      : public ::paddle::framework::DefaultGradOpDescMaker {                   \
+    using ::paddle::framework::DefaultGradOpDescMaker::DefaultGradOpDescMaker; \
+                                                                               \
+   protected:                                                                  \
+    virtual std::string GradOpType() const { return #grad_op_type; }           \
+  };                                                                           \
+  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,     \
+                    op_maker_class);
+
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP)
+  REGISTER_OPERATOR(op_type, op_class, op_maker_class)

 /**
 * Macro to register OperatorKernel.
--- a/paddle/framework/tensor_array.h
+++ b/paddle/framework/tensor_array.h
@ -26,6 +26,9 @@ namespace framework {
 * in original lod-tensor.
 */
 struct DySeqMeta {
+  DySeqMeta(size_t begin, size_t end, size_t ori_idx)
+      : begin(begin), end(end), ori_idx(ori_idx) {}
+
  size_t begin;
  size_t end;  // not included
  size_t ori_idx;
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@ -97,6 +97,17 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
  }
 };

+class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TanhShrinkOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of TanhShrink operator");
+    AddOutput("Y", "Output of TanhShrink operator");
+    AddComment("TanhShrink activation operator, tanhshrink(x) = x - tanh(x)");
+  }
+};
+
 class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@ -235,6 +246,9 @@ REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
 REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
            ops::ActivationOpGrad);

+REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
+            tanh_shrink_grad, ops::ActivationOpGrad);
+
 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
            ops::ActivationOpGrad);

--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@ -146,6 +146,24 @@ struct TanhGradFunctor : public BaseActivationFunctor<T> {
  }
 };

+// tanhshrink(x) = x - tanh(x)
+// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+template <typename T>
+struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x - x.tanh();
+  }
+};
+
+template <typename T>
+struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * (x.tanh() * x.tanh());
+  }
+};
+
 // sqrt(x) = x^(1/2)
 template <typename T>
 struct SqrtFunctor : public BaseActivationFunctor<T> {
@ -407,4 +425,5 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
  __macro(pow, PowFunctor, PowGradFunctor);                      \
  __macro(stanh, STanhFunctor, STanhGradFunctor);                \
  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);       \
-  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor)
+  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);   \
+  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor)
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@ -36,7 +36,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").NotInGradient();
+    AddOutput("Out", "The output of mean op");
    AddComment(R"DOC( Mean Operator
 )DOC");
  }
@ -52,11 +52,27 @@ class MeanGradOp : public framework::OperatorWithKernel {
  }
 };

+class MeanGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto* grad_op = new framework::OpDescBind();
+    grad_op->SetType("mean_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, mean_grad, ops::MeanGradOp);
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
 REGISTER_OP_CPU_KERNEL(mean,
                       ops::MeanKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(mean_grad,
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@ -49,9 +49,9 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The left tensor of minus operator.").NotInGradient();
-    AddInput("Y", "The right tensor of minus operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of minus operator.").NotInGradient();
+    AddInput("X", "The left tensor of minus operator.");
+    AddInput("Y", "The right tensor of minus operator.");
+    AddOutput("Out", "The output tensor of minus operator.");

    AddComment(R"DOC(Minus Operator

@ -64,26 +64,35 @@ or not. But the output only shares the LoD with input `X`.
 )DOC");
  }
 };
-template <typename AttrType>
-class MinusGradOp : public NetOp {
+
+class MinusGradMaker : public framework::GradOpDescMakerBase {
 public:
-  MinusGradOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    auto out_grad = Input(framework::GradVarName("Out"));
-    auto x_grad = Output(framework::GradVarName("X"));
-    auto y_grad = Output(framework::GradVarName("Y"));
-
-    // x_grad = out_grad
-    AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
-                                             {{"Y", {x_grad}}}, {}));
-
-    framework::AttributeMap scale_attr;
-    scale_attr["scale"] = static_cast<AttrType>(-1);
-    AppendOp(framework::OpRegistry::CreateOp("scale", {{"X", {out_grad}}},
-                                             {{"Out", {y_grad}}}, scale_attr));
-    CompleteAddOp(false);
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
+      const override {
+    std::vector<std::unique_ptr<framework::OpDescBind>> ops;
+    auto x_g = InputGrad("X");
+    if (!x_g.empty()) {
+      auto *x_g_op = new framework::OpDescBind();
+      x_g_op->SetType("scale");
+      x_g_op->SetInput("X", OutputGrad("Out"));
+      x_g_op->SetOutput("Out", x_g);
+      x_g_op->SetAttr("scale", 1.0f);
+      ops.emplace_back(x_g_op);
+    }
+
+    auto y_g = InputGrad("Y");
+    if (!y_g.empty()) {
+      auto *y_g_op = new framework::OpDescBind();
+      y_g_op->SetType("scale");
+      y_g_op->SetInput("X", OutputGrad("Out"));
+      y_g_op->SetOutput("Out", y_g);
+      y_g_op->SetAttr("scale", -1.0f);
+      ops.emplace_back(y_g_op);
+    }
+
+    return ops;
  }
 };

@ -91,7 +100,6 @@ class MinusGradOp : public NetOp {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
-            ops::MinusGradOp<float>);
+REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
 REGISTER_OP_CPU_KERNEL(minus,
                       ops::MinusKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@ -56,8 +56,7 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
             "The input should be a k-D tensor(k > 0 and k < 7)");
    AddOutput("Out",
              "The output of pad op."
-              "A tensor with the same shape as X.")
-        .NotInGradient();
+              "A tensor with the same shape as X.");
    AddComment(R"DOC(
 Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example:

@ -111,11 +110,29 @@ class PadOpGrad : public framework::OperatorWithKernel {
  }
 };

+class PadOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto* bind = new framework::OpDescBind();
+    bind->SetInput("X", Input("X"));
+    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    bind->SetAttrMap(Attrs());
+    bind->SetType("pad_grad");
+    return std::unique_ptr<framework::OpDescBind>(bind);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(pad, ops::PadOp, ops::PadOpMaker, pad_grad, ops::PadOpGrad);
+
+REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
+REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
 REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(pad_grad,
                       ops::PadGradKernel<paddle::platform::CPUPlace, float>);
--- a/Show More
+++ b/Show More