diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 83fe9af768..59661c9c1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,6 +31,3 @@ - id: go-fmt types: - go - - id: gometalinter - types: - - go diff --git a/CMakeLists.txt b/CMakeLists.txt index 4783095194..1252e75398 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON) "different Python interpreter from compiling.") endif() +if(MOBILE_INFERENCE) + set(THIRD_PARTY_BUILD_TYPE MinSizeRel) +else() + set(THIRD_PARTY_BUILD_TYPE Release) +endif() + ######################################################################################## include(external/mklml) # download mklml package diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f7483f6be9..bd853d921b 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -8,7 +8,7 @@ ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "master" + GIT_TAG 4e79cb69b9425f5f8c3a84be4350d4ab75b5fd9d PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 957f8271e4..c819eb4d70 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -36,6 +36,7 @@ ExternalProject_Add( # change this back to the official Github repo once my PR is # merged. GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" + GIT_TAG 986964c07427ecb9cdb5bd73f73ebbd40e54dadb PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -45,11 +46,11 @@ ExternalProject_Add( -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index b3fef738cc..08bdc1e162 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -31,6 +31,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY "https://github.com/google/glog.git" + GIT_TAG v0.3.5 PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -43,12 +44,12 @@ ExternalProject_Add( -DWITH_GFLAGS=ON -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 6a2a79b763..5a4aa7a5b7 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -56,11 +56,11 @@ IF(WITH_TESTING) -DBUILD_GMOCK=ON -Dgtest_disable_pthreads=ON -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} 
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7cf7ba85cc..be7f6a9465 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -191,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ${OPTIONAL_ARGS} -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON ${OPTIONAL_CACHE_ARGS} diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index bb258c7b55..8bd0582228 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -35,6 +35,7 @@ ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" + GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09 PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -48,9 +49,9 @@ ExternalProject_Add( -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON -DBUILD_SHARED=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c496a52b78..e2c9fe56f3 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -42,11 +42,11 @@ ExternalProject_Add( -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_MACOSX_RPATH=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) LIST(APPEND external_project_dependencies zlib) diff --git a/doc/design/block.md b/doc/design/block.md index 9c812732d6..7cbf0d55b1 100644 --- a/doc/design/block.md +++ b/doc/design/block.md @@ -243,7 +243,7 @@ class SymbolTable { // TODO determine whether name is generated by python or C++. // Currently assume that a unique name will be generated by C++ if the // argument name is left default. - VarDesc* NewVar(const string& name=""); + VarDesc* Var(const string& name=""); // find a VarDesc by name, if recursive is true, find parent's SymbolTable // recursively. diff --git a/doc/design/executor.md b/doc/design/executor.md new file mode 100644 index 0000000000..b5fb6c5c3c --- /dev/null +++ b/doc/design/executor.md @@ -0,0 +1,23 @@ +# Executor Design Doc + +## Motivation + +We use executor to do the runtime evaluation of a `ProgramDesc`. + +## Overview + +An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. 
The `Scope` is the container of all the variable instances, and it is persistent across different runs. + +### What does the executor do? + +It evaluates all the operators in the `block_id`th block of a `ProgramDesc`. + +### What does the executor NOT do? + +It does not do runtime optimization, meaning it does not intelligently parse the dependencies of each op, choose which ones to run, or decide the order in which they should be run. + +It does not do graph partitioning, meaning it does not divide the `ProgramDesc` into several small pieces and execute them on different devices. + +## Implementation + +`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc) diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot index 8d1b673abf..e115f9844b 100644 --- a/doc/design/images/graph_construction_example.dot +++ b/doc/design/images/graph_construction_example.dot @@ -33,7 +33,6 @@ digraph ImageClassificationGraph { cost -> MSE_Grad [color=red]; d_cost -> MSE_Grad [color=red]; - x -> MSE_Grad [color=red]; l -> MSE_Grad [color=red]; y -> MSE_Grad -> d_y [color=red]; diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png index 1811875034..261611a572 100644 Binary files a/doc/design/images/graph_construction_example_all.png and b/doc/design/images/graph_construction_example_all.png differ diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png index 3049a9315f..4c69687f4a 100644 Binary files a/doc/design/images/graph_construction_example_forward_backward.png and b/doc/design/images/graph_construction_example_forward_backward.png differ diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png index 25d19088cb..e668c16e0c 100644 Binary files a/doc/design/images/graph_construction_example_forward_only.png and b/doc/design/images/graph_construction_example_forward_only.png differ diff --git a/doc/design/infer_var_type.md b/doc/design/infer_var_type.md new file mode 100644 index 0000000000..d9d5397bec --- /dev/null +++ b/doc/design/infer_var_type.md @@ -0,0 +1,78 @@ +# Design Doc: InferVarType + +## The Problem Posed + +The variable in our design can hold various types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs. + +For example, a `lookup table` operator takes two `LoDTensor`s: one is a float tensor as the embedding table, the other is an int tensor as the word IDs. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs; it will generate a `LoDTensor` if any of its inputs is a `LoDTensor`, otherwise it will generate `SelectedRows` as its output. + +The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameters) or be inferred by the operator at compile time. + +## Proposed Solution + +`InferVarType` is a compile-time function which is registered to each operator.
The interface of that function is: + + +```c++ +using InferVarTypeFN = std::function< + void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>; +``` + +It takes an operator description as its input and writes the inferred output variable types into the block description. + +The `InferVarTypeFN` will be registered in `OpInfo`, as its `infer_var_type_` field. The `OpInfo` should be + +```cpp +struct OpInfo { + InferVarTypeFN infer_var_type_; + ... +}; +``` + +The default `InferVarType` will set the output type to `LoDTensor`. This can be done by `GetInferVarType()`. + +```cpp +void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) { + // set the output type of variable as `LoDTensor`. + // ... +} + +struct OpInfo { + InferVarTypeFN infer_var_type_; + InferVarTypeFN GetInferVarType() const { + if (infer_var_type_) { + return infer_var_type_; + } else { + return DefaultInferVarType; + } + } +}; +``` + +## Register InferVarType + +We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not. + +```cpp +class VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0; +}; +``` + +Operator developers can write a specialized `VarTypeInferer` as follows. + +```cpp +class SpecialVarTypeInferer : public VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const { + // .. own logic + } +}; +``` + +Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`. + +``` +REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...); +``` diff --git a/doc/design/python_api.md b/doc/design/python_api.md index 56ae1d925a..cb5fdc765b 100644 --- a/doc/design/python_api.md +++ b/doc/design/python_api.md @@ -179,40 +179,104 @@ init_attr={ `optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. -## Layer Functions +## Layer Function -A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. +A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. -### Data Layer +Layer functions take `Variable`s and configuration parameters as their input and return the output variable(s). + +For example, `FullyConnected` takes one or more variables as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. + + +### Necessity for reusing code between layer functions + +There is a lot of code that can be reused, such as: + +* Give default configuration values, e.g., the default initialization strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`, and the default initialization strategy for bias is to fill it with zeros. +* Append the activation operator. +* Create a temporary variable. +* Create parameters. +* Generate a unique name. +* Add a bias. +* ... + +A mechanism to reuse code between layer functions is necessary.
It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. + + + +### Comparison between global functions and a helper class + +The `FullyConnected` layer will be as follows when we provide global functions: ```python -def data_layer(name, type, column_name): - block = the_current_program.glolal_block() - var = block.create_global_var( - name=name, - shape=[None] + type.dims(), - dtype=type.dtype) - block.prepend_operator(block, - type="Feed", - inputs = None, - outputs = [var], - {column_name: column_name}) - return var +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + if name is None: + name = unique_name("fc") + input = multiple_input(input) + param_attr = default_param_attr(param_attr) + param_attr = multiple_param_attr(param_attr, len(input)) + + # mul + mul_results = [] + for ipt, attr in zip(input, param_attr): + shape = ipt.shape[1:] + [size] + w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) + tmp = create_tmp_var(name) + g_program.current_block().append_op("mul", {ipt, w}, {tmp}) + mul_results.append(tmp) + + # add sum + ... + # add bias + ... + # add activation + ... + return out ``` -The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md). +We can provide many helper functions for layer developers. However, there are several disadvantages to global helper functions: + +1. We need a namespace for these methods, so that layer developers can quickly figure out which methods they can use. +2. Global functions force layer developers to pass their parameters again and again. + +So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` layer will then be as follows. + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + helper = LayerHelper(**locals()) # pass all parameters to LayerHelper + + mul_results = [] + for ipt, param in helper.iter_multiple_input_and_param(): + w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype=ipt.dtype) + tmp = helper.create_tmp_variable() + helper.append_op('mul', {ipt, w}, {tmp}) + mul_results.append(tmp) + + pre_bias = helper.add_sum(mul_results) + pre_activation = helper.add_bias(pre_bias) + return helper.add_activation(pre_activation) +``` + +We not only use fewer lines of code to write `fc_layer` but also make the code easier to understand. At the same time, layer developers can figure out which functions they can invoke by typing `helper.` in a Python editor. + + +### Implementation of layer helper -### FC Layer +We simply keep all parameters of a layer function in a dictionary, held by the layer helper as a private data member. Every method of the layer helper looks up this dictionary when it is invoked. In that way, we can implement one layer helper for all layer functions, even though some layers do not use some operators. For example, the `activation` is used by the FullyConnected and convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` is: ```python -def fc_layer(input, size, ...): - block = program.current_block() - w = block.create_parameter(...) - b = block.create_parameter(...)
- out = block.create_var() - op = block.append_operator("FC", X=input, W=w, b=b, out=out) - out.writer = op - return out +class LayerHelper(object): + def __init__(self, **kwargs): # kwargs is short for `keyword arguments` + self.kwargs = kwargs + + def add_activation(self, input_var): + act = self.kwargs.get("act", None) # default value is None + if act is None: # do nothing if no act + return input_var + + tmp = self.create_tmp_var() + self.append_op(type=act, input=input_var, output=tmp) + return tmp ``` ## Optimizer diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md index 3cf8a59446..9f1ce4bae7 100644 --- a/doc/design/register_grad_op.md +++ b/doc/design/register_grad_op.md @@ -3,15 +3,17 @@ ## The Problem Posed -In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance. +Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes a C++ operator instance and returns the corresponding gradient operator instance. -However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message, and returns a corresponding message. +However, we noticed two problems with the current design: -More than that, the new registration mechanism need to support the fact that an operators' gradient computation might be a composition of operators. +1. As we decided to separate the *compilation* and *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message. -## Current Implementation +1. Some operators' gradient computation requires more than one gradient operator. For example, the gradient of *minus* consists of two operators -- an identity operator and a scale operator. So we need to make the registration mechanism support the mapping from an operator to a set of operators for gradient computation. -OpInfos store in a association map which key is the operator type. The `grad_op_type` indicate associated gradient operator type. Operator can create gradient operator by `OpInfo::creator_` of gradient. The pseudo code is +## The Current Implementation + +The C++ class `OpInfos` is stored in an association map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create its gradient operator through the gradient's `OpInfo::creator_`. The pseudo code is ```cpp struct OpInfo { diff --git a/doc/design/scope.md b/doc/design/scope.md index b1f9bb4378..4da76eebb7 100644 --- a/doc/design/scope.md +++ b/doc/design/scope.md @@ -37,7 +37,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`. ```cpp class Scope { public: - Variable* NewVar(const std::string& name); + Variable* Var(const std::string& name); const Variable* FindVar(const std::string& name) const; private: @@ -98,7 +98,7 @@ class Scope { Variable* FindVar(const std::string& name) const; // return if already contains same name variable.
- Variable* NewVar(const std::string& name); + Variable* Var(const std::string& name); private: std::shared_ptr parent_; @@ -107,7 +107,7 @@ class Scope { ``` ## Only scope can create a variable -To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`. +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function and make Scope a friend class of Variable. Then only `Var` can construct a `Variable`. ## When scope destroyed, all variables inside this scope should be destroyed together @@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar ## Orthogonal interface -`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily. +`FindVar` will return `nullptr` when `name` is not found, so it can be used as a `Contains` method. `Var` will return an `Error` when there is a name conflict locally. By combining `FindVar` and `Var`, we can implement `Var` easily. diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md index 8378e97bf7..37e4f7b90f 100644 --- a/doc/design/tensor_array.md +++ b/doc/design/tensor_array.md @@ -161,7 +161,7 @@ class TensorArray: @name: str the name of the variable to output. ''' - tensor = NewVar(name) + tensor = Var(name) tensor_array_stack(self.name, tensor) return tensor diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md index bfbbdd0578..0b2958c1b1 100644 --- a/doc/design/var_desc.md +++ b/doc/design/var_desc.md @@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep ## Definition of VarDesc -A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it. +A VarDesc should have a name and a value. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`. ```proto message VarDesc { required string name = 1; - optional LoDTensorDesc lod_tensor = 2; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LoDTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; } ``` -## Definition of LodTensorDesc +## Definition of TensorDesc ```proto enum DataType { @@ -38,87 +45,25 @@ enum DataType { FP64 = 6; } -message LoDTensorDesc { +message TensorDesc { required DataType data_type = 1; - repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - optional int32 lod_level = 3 [default=0]; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] } ``` -## Definition of Variable in Python - -In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable. - -```python -image = Variable(dims=[-1, 640, 480]) -# fc1 and fc2 are both Variable -fc1 = layer.fc(input=image, output_size=10) -fc2 = layer.fc(input=fc1, output_size=20) -``` -### what should class `Variable` Have -1. `name`.a name of string type is used to mark the value of the Variable. -1. `initializer`. Since our Tensor does not have value.
we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator. -1. `operator`. Variable should record which operator produce itself. The reaon is: - - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable. -In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph. -```python -import VarDesc -import LoDTensorDesc -import framework - -def AddInitialOperator(variable, initializer): - # add an initialize Operator to block to init this Variable - -class Variable(object): - def __init__(self, name, dims, type, initializer): - self._block = get_default_block() - self._name = name - self.op = None - - tensor_desc = LoDTensorDesc(data_type=type, dims=dims) - _var_desc = VarDesc(name=name, lod_tensor=tensor_desc) - self._var = framework.CreateVar(_var_desc) - self._block.add_var(self) +A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md). - # add initial op according to initializer - if initializer is not None: - AddInitialOperator(self, initializer) - - def dims(self): - return self._var.dims() - - def data_type(self): - return self._var.data_type() +## Definition of LoDTensorDesc - def to_proto(self): - pass +```proto +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2; +} +``` -Then we can use this Variable to create a fc layer in Python. +A LoDTensorDesc contains a tensor and a lod_level. -```python -import paddle as pd - -def flatten_size(X, num_flatten_dims): - prod = 1 # of last num_flatten_dims - for i in xrange(num_flatten_dims): - prod = prod * X.dims[-i-1] - return prod - -def layer.fc(X, output_size, num_flatten_dims): - W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size]) - b = Variable(pd.random_uniform(), type=FP32, dims=[output_size]) - out = Variable(type=FP32) - y = operator.fc(X, W, b, output=out) # fc will put fc op input into out - pd.InferShape(y) - return out - -x = Variable(dims=[-1, 640, 480]) -y = layer.fc(x, output_size=100) -z = layer.fc(y, output_size=200) +## Definition of Variable in Python -paddle.eval(targets=[z], ...) -print(z) -``` +For Variable in Python, please refer to [`Python API`](./python_api.md). diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst index 4d684cf8ad..63fa161faf 100644 --- a/doc/howto/deep_model/rnn/rnn_config_cn.rst +++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst @@ -21,7 +21,7 @@ wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py