merge baidu/develop

8 years ago · aa1ac36cc2
parent 5f5c00f244 fc5c432a32
commit aa1ac36cc2
135 changed files with 2817 additions and 1070 deletions
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)

 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

@ -40,11 +50,11 @@ net = img_conv_layer(
    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)

 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

 net = fc_layer(
@ -59,6 +69,9 @@ net = fc_layer(
    layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())

-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
    else:
        settings.data_size = settings.height * settings.width
    settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
    if settings.is_infer:
        settings.slots = [dense_vector(settings.data_size)]
    else:
@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
@provider(
    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
        if settings.is_infer:
            yield img.astype('float32')
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@ -37,7 +37,7 @@ function infer() {
      --trainer_count=1 \
      --num_passes=1 \
      --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
      > /dev/null 2>&1
    echo "Done"
  fi
@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
  for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
    infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@ -47,5 +47,6 @@ for use_mkldnn in True False; do
    train vgg 19 $batchsize $use_mkldnn
    train resnet 50 $batchsize $use_mkldnn
    train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@ -23,24 +23,25 @@ function infer() {
    echo "./run_mkl_infer.sh to save the model first"
    exit 0
  fi
-  log_period=$((256 / bs))
+  log_period=$((32 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
+    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

-  # calculate the last 5 logs period time of 1280 samples,
+  # calculate the last 5 logs period time of 160(=32*5) samples,
  # the time before are burning time.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }

@ -56,7 +57,8 @@ fi

 # inference benchmark
 for batchsize in 1 2 4 8 16; do
-  infer googlenet v1 $batchsize
-  infer resnet 50 $batchsize
  infer vgg 19 $batchsize
+  infer resnet 50 $batchsize 
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
 done
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@ -12,10 +12,11 @@ function train() {
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
+    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
+    --log_period=3 \
+    --test_period=30 \
    --config_args=$args \
    2>&1 | tee ${log} 

@ -36,4 +37,5 @@ for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
 done
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
    IF(WITH_C_API)
        INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
        IF(ANDROID)
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
        ELSE()
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
        ENDIF()
    ENDIF()

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -467,7 +467,7 @@ lambda_cost
    :noindex:

 square_error_cost
--------
+-----------------
 ..  autoclass:: paddle.v2.layer.square_error_cost
    :noindex:

@ -533,7 +533,7 @@ Miscs
 =====

 dropout
--------------
+--------
 ..  autoclass:: paddle.v2.layer.dropout
    :noindex:

--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@ -3,19 +3,19 @@ Nets
 ===========

 simple_img_conv_pool
-----------
+--------------------
 ..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
    :noindex:


 img_conv_group
-----------
+---------------
 ..  autofunction:: paddle.v2.fluid.nets.img_conv_group
    :noindex:


 sequence_conv_pool
-----------
+------------------
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
    :noindex:

--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@ -18,7 +18,7 @@ SGDOptimizer


 MomentumOptimizer
-----------
+-----------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: MomentumOptimizer
    :noindex:
@ -26,14 +26,14 @@ MomentumOptimizer


 AdagradOptimizer
-----------
+----------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: AdagradOptimizer
    :noindex:


 AdamOptimizer
-----------
+-------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: AdamOptimizer
    :noindex:
@ -47,7 +47,7 @@ AdamaxOptimizer


 DecayedAdagradOptimizer
-----------
+-----------------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: DecayedAdagradOptimizer
    :noindex:
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@ -3,14 +3,14 @@ Regularizer
 ===========

 WeightDecayRegularizer
-----------
+----------------------
 ..  automodule:: paddle.v2.fluid.regularizer
    :members: WeightDecayRegularizer
    :noindex:


 L2DecayRegularizer
-----------
+------------------
 ..  automodule:: paddle.v2.fluid.regularizer
    :members: L2DecayRegularizer
    :noindex:
@ -18,7 +18,7 @@ L2DecayRegularizer


 L1DecayRegularizer
-----------
+-------------------
 ..  automodule:: paddle.v2.fluid.regularizer
    :members: L1DecayRegularizer

--- a/doc/design/kernel_hint_design.md
+++ b/doc/design/kernel_hint_design.md
@ -0,0 +1,57 @@
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+ `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+
+So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel.
+
+2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint.
+
+### Final choice
+To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=false):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
--- a/doc/design/operator_kernel_type.md
+++ b/doc/design/operator_kernel_type.md
@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, an certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. 
+
+We often implement a kernel of an operator with some computing library in certain device(place). Please remind that computing library and device are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. 
+
+For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
+
+It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
+
+Following is the details:
+
+### Place
+
+`Place` is defined as follows:
+
+```cpp
+typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
+```
+
+`Place` is to represent the device memory where data is locating.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now have two default DeviceContexts in CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains a Eigen library handle and `CDUADeviceContext` contains a Eigen library handle and cuBLAS handle.
+
+If we want to support new Library, a new enumerator need to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
+
+Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually a enum variable. If fluid is built with MKLDNN, then, the memory format in MKLDNN will be added into this enum variable too.
+
+- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout of generating data. Of course, we can have some default layout, like NCHW.
+
+- The inference of Layout is at run-time, not compile-time.
+
+- Every operator have to implement different kernels for different layouts. Let's take MKLDNN as an example, if we want to implement a MKLDNN convolution operator, we have to realize all the kernels for different layout, list at [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to do registering kernels for MKLDNN operators.
+
+`Layout` is also defined as a enum variable:
+
+```cpp
+enum Layout {
+  kNCHW,
+  kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+  knChw8c
+  ...
+#endif
+};
+```
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@ -70,13 +70,13 @@ PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其
   :header: "依赖", "版本", "说明"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
-   "Python", "2.7.x", "依赖libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "可选"
+   "Go", ">=1.8", "可选"


 .. _build_options:
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@ -76,13 +76,13 @@ will be downloaded automatically.
   :header: "Dependency", "Version", "Description"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
-   "Python", "2.7.x", "Need libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "Optional"
+   "Go", ">=1.8", "Optional"


 .. _build_options:
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@ -37,11 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具
    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

 .. _pip_dependency:

--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@ -40,11 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

 .. _pip_dependency:

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -42,7 +42,7 @@ static std::unordered_set<std::string>& CtrlFlowOps() {
 static inline std::unique_ptr<OperatorBase> CreateGradOp(
    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
    std::unordered_map<std::string, std::string>* grad_to_var) {
-  OpDescBind op_desc;
+  OpDesc op_desc;
  op_desc.SetInputMap(op.Inputs());
  op_desc.SetOutputMap(op.Outputs());
  op_desc.SetType(op.Type());
@ -53,7 +53,7 @@ static inline std::unique_ptr<OperatorBase> CreateGradOp(
  grad_ops.reserve(grad_descs.size());
  std::transform(grad_descs.begin(), grad_descs.end(),
                 std::back_inserter(grad_ops),
-                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
+                 [](const std::unique_ptr<OpDesc>& grad_desc) {
                   return OpRegistry::CreateOp(*grad_desc);
                 });
  PADDLE_ENFORCE(!grad_ops.empty());
@ -217,7 +217,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
        // If part of input gradient of that operator is not calculated, fill
        // zero variables to that input gradient.
        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Y", {grad_input}}},
+                                           {{"Out", {grad_input}}},
                                           AttributeMap{}));
      }
      return false;
@ -296,7 +296,7 @@ static std::string FwdName(const std::string& grad_name) {
 static void CreateGradVarInBlock(
    size_t grad_op_start_index,
    const std::unordered_map<std::string, std::string>& param_name_map,
-    BlockDescBind* block_desc,
+    BlockDesc* block_desc,
    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
  auto ops = block_desc->AllOps();
  for (size_t op_index = grad_op_start_index; op_index < ops.size();
@ -350,12 +350,11 @@ static void CreateGradVarInBlock(
  }
 }

-std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
-    const OpDescBind* op_desc, std::unordered_set<std::string>* no_grad_vars,
+std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
+    const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var,
-    const std::vector<BlockDescBind*>& grad_block =
-        std::vector<BlockDescBind*>()) {
-  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
+    const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
+  std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
  // All input gradients of forwarding operator do not need to calculate.
  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
  if (AllGradInSet(inputs, *no_grad_vars)) {
@ -386,7 +385,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
          .Get(op_desc->Type())
          .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);

-  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
+  std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
  for (auto& desc : grad_op_descs) {
    for (const std::string& in_name : desc->InputArgumentNames()) {
      if (no_grad_vars->count(in_name)) {
@ -394,9 +393,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
        std::string new_name = prefix + kZeroVarSuffix;
        desc->Rename(in_name, new_name);
-        std::unique_ptr<OpDescBind> fill_zeros_op(
-            new OpDescBind("fill_zeros_like", {{"X", {prefix}}},
-                           {{"Y", {new_name}}}, AttributeMap{}));
+        std::unique_ptr<OpDesc> fill_zeros_op(
+            new OpDesc("fill_zeros_like", {{"X", {prefix}}},
+                       {{"Out", {new_name}}}, AttributeMap{}));
        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
      }
    }
@ -408,34 +407,33 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
  return grad_op_descs;
 }

-static BlockDescBind* CreateStepBlock(
-    ProgramDescBind& program_desc,
-    std::unordered_set<std::string>* no_grad_vars,
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var,
    int step_block_idx);

-std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
-    ProgramDescBind& program_desc, int block_idx,
+std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
+    ProgramDesc& program_desc, int block_idx,
    std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var) {
  VLOG(5) << "MakeBlockBackward";
-  BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
-  std::vector<OpDescBind*> op_descs = cur_block->AllOps();
+  BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
+  std::vector<OpDesc*> op_descs = cur_block->AllOps();
  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
  size_t grad_desc_idx = 0;
-  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
+  std::vector<std::unique_ptr<OpDesc>> backward_descs;

  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
    VLOG(5) << "Making backward " << (*it)->Type() << " op";
-    std::vector<std::unique_ptr<OpDescBind>> op_grads;
+    std::vector<std::unique_ptr<OpDesc>> op_grads;

    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
      int step_block_idx = (*it)->GetBlockAttr("sub_block");
-      BlockDescBind* backward_block = CreateStepBlock(
-          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
+                                                  grad_to_var, step_block_idx);
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
    } else if ((*it)->Type() == "conditional_block") {
-      BlockDescBind* backward_block =
+      BlockDesc* backward_block =
          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
                          (*it)->GetBlockAttr("sub_block"));
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
@ -463,14 +461,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
      }
      ++grad_desc_idx;
    }
-    std::transform(
-        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
-        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
+    std::transform(op_grads.begin(), op_grads.end(),
+                   std::back_inserter(backward_descs),
+                   [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
  }

  VLOG(5) << "Appending Sums";
  // Check whether some variables are written more than once
-  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
+  std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
  for (const auto& dup : dup_out_ops) {
    const std::string& out_name = dup.first;
    const std::vector<size_t> dup_op = dup.second;
@ -486,18 +484,17 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
        sum_op_inputs.emplace_back(new_name);
        next_g_name = sum_op_inputs.back();
      }
-      std::unique_ptr<OpDescBind> sum_op(
-          new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}},
-                         AttributeMap{}));
+      std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
+                                                {{"Out", {out_name}}},
+                                                AttributeMap{}));
      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
    }
  }

-  pending_sum_ops.sort(
-      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
-         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
-        return a.first > b.first;
-      });
+  pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
+                          const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
+    return a.first > b.first;
+  });
  for (auto& p : pending_sum_ops) {
    backward_descs.insert(backward_descs.begin() + p.first + 1,
                          std::move(p.second));
@ -508,14 +505,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
  return backward_descs;
 }

-static BlockDescBind* CreateStepBlock(
-    ProgramDescBind& program_desc,
-    std::unordered_set<std::string>* no_grad_vars,
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var,
    int step_block_idx) {
  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
                                                   no_grad_vars, grad_to_var);
-  BlockDescBind* backward_block =
+  BlockDesc* backward_block =
      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
  for (auto& ptr : backward_block_op_descs) {
    backward_block->AppendAllocatedOp(move(ptr));
@ -524,7 +520,7 @@ static BlockDescBind* CreateStepBlock(
 }

 ParamGradInfoMap AppendBackward(
-    ProgramDescBind& program_desc, const VarDescBind& target,
+    ProgramDesc& program_desc, const VarDesc& target,
    const std::unordered_set<std::string>& no_grad_vars) {
  std::unordered_set<std::string> no_grad_var_names;
  no_grad_var_names.reserve(no_grad_vars.size() + 1);
@ -541,11 +537,11 @@ ParamGradInfoMap AppendBackward(
  PADDLE_ENFORCE(is_scalar, "target should be scalar");
  VLOG(3) << "backward from loss=" << target.Name()
          << " data_type=" << target.GetDataType();
-  std::unique_ptr<OpDescBind> fill_one_op(
-      new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", std::vector<int>{1}},
-                      {"value", static_cast<float>(1.0)},
-                      {"dtype", target.GetDataType()}}));
+  std::unique_ptr<OpDesc> fill_one_op(
+      new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
+                 {{"shape", std::vector<int>{1}},
+                  {"value", static_cast<float>(1.0)},
+                  {"dtype", target.GetDataType()}}));
  // infer var type of fill_one_op
  fill_one_op->InferVarType(root_block);

--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
                                            GradVarInfo /*grad_var_info*/>;

 ParamGradInfoMap AppendBackward(
-    ProgramDescBind& program_desc, const VarDescBind& target,
+    ProgramDesc& program_desc, const VarDesc& target,
    const std::unordered_set<std::string>& no_grad_vars);

 }  // namespace framework
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@ -19,18 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-VarDescBind *BlockDescBind::Var(const std::string &name) {
+VarDesc *BlockDesc::Var(const std::string &name) {
  auto it = vars_.find(name);
  if (it != vars_.end()) {
    return it->second.get();
  }
  need_update_ = true;
-  auto *var = new VarDescBind(name);
+  auto *var = new VarDesc(name);
  vars_[name].reset(var);
  return var;
 }

-VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
+VarDesc *BlockDesc::FindVar(const std::string &name) const {
  auto it = vars_.find(name);
  if (it == vars_.end()) {
    return nullptr;
@ -38,11 +38,11 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
  return it->second.get();
 }

-bool BlockDescBind::HasVar(const std::string &name) const {
+bool BlockDesc::HasVar(const std::string &name) const {
  return vars_.find(name) != vars_.end();
 }

-VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
  if (name == kEmptyVarName) return nullptr;

  auto it = vars_.find(name);
@ -53,53 +53,67 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
  return it->second.get();
 }

-VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
-    const std::string &name_bytes) {
-  VarDescBind *res = FindVarRecursive(name_bytes);
+VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
+  VarDesc *res = FindVarRecursive(name_bytes);
  if (res == nullptr) {
    res = Var(name_bytes);
  }
  return res;
 }

-bool BlockDescBind::HasVarRecursive(const std::string &name) const {
+bool BlockDesc::HasVarRecursive(const std::string &name) const {
  return FindVarRecursive(name) != nullptr;
 }

-std::vector<VarDescBind *> BlockDescBind::AllVars() const {
-  std::vector<VarDescBind *> res;
+std::vector<VarDesc *> BlockDesc::AllVars() const {
+  std::vector<VarDesc *> res;
  for (const auto &p : vars_) {
    res.push_back(p.second.get());
  }
  return res;
 }

-OpDescBind *BlockDescBind::AppendOp() {
+OpDesc *BlockDesc::AppendOp() {
  need_update_ = true;
-  ops_.emplace_back(new OpDescBind());
+  ops_.emplace_back(new OpDesc());
  return ops_.back().get();
 }

-void BlockDescBind::AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc) {
+void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
  need_update_ = true;
  ops_.emplace_back(std::move(op_desc));
 }

-OpDescBind *BlockDescBind::PrependOp() {
+OpDesc *BlockDesc::PrependOp() {
  need_update_ = true;
-  ops_.emplace_front(new OpDescBind());
+  ops_.emplace_front(new OpDesc());
  return ops_.front().get();
 }

-std::vector<OpDescBind *> BlockDescBind::AllOps() const {
-  std::vector<OpDescBind *> res;
+void BlockDesc::RemoveOp(size_t s, size_t e) {
+  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
+    return;
+  }
+  need_update_ = true;
+  for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
+    auto names = (*it)->InputArgumentNames();
+    for (auto n : names) {
+      // TODO(typhoonzero): delete vars if no other op use it.
+      VLOG(3) << "deleting var " << n;
+    }
+  }
+  ops_.erase(ops_.begin() + s, ops_.begin() + e);
+}
+
+std::vector<OpDesc *> BlockDesc::AllOps() const {
+  std::vector<OpDesc *> res;
  for (const auto &op : ops_) {
    res.push_back(op.get());
  }
  return res;
 }

-void BlockDescBind::Flush() {
+void BlockDesc::Flush() {
  for (auto &op_desc : ops_) {
    op_desc->Flush();
  }
@ -121,43 +135,43 @@ void BlockDescBind::Flush() {
  }
 }

-BlockDescBind *BlockDescBind::ParentBlock() const {
+BlockDesc *BlockDesc::ParentBlock() const {
  if (this->desc_->parent_idx() == kNoneBlockIndex) {
    return nullptr;
  }
  return prog_->MutableBlock(static_cast<size_t>(this->desc_->parent_idx()));
 }

-proto::BlockDesc *BlockDescBind::Proto() {
+proto::BlockDesc *BlockDesc::Proto() {
  Flush();
  return desc_;
 }

-BlockDescBind::BlockDescBind(ProgramDescBind *prog, proto::BlockDesc *desc)
+BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
    : prog_(prog), desc_(desc), need_update_(false) {
  for (const proto::VarDesc &var_desc : desc_->vars()) {
-    vars_[var_desc.name()].reset(new VarDescBind(var_desc));
+    vars_[var_desc.name()].reset(new VarDesc(var_desc));
  }
  for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDescBind(op_desc, prog));
+    ops_.emplace_back(new OpDesc(op_desc, prog));
  }
 }

-BlockDescBind::BlockDescBind(const BlockDescBind &other, proto::BlockDesc *desc,
-                             ProgramDescBind *prog)
+BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
+                     ProgramDesc *prog)
    : prog_(prog), desc_(desc) {
  need_update_ = true;
  for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDescBind(*op));
+    ops_.emplace_back(new OpDesc(*op));
  }

  for (auto &it : other.vars_) {
-    auto *var = new VarDescBind(*it.second);
+    auto *var = new VarDesc(*it.second);
    vars_[it.first].reset(var);
  }
 }

-void BlockDescBind::ClearPBOps() {
+void BlockDesc::ClearPBOps() {
  auto ops = this->desc_->mutable_ops();
  while (!ops->empty()) {
    // we do not own the OpDesc, so release the ownership.
@ -165,7 +179,7 @@ void BlockDescBind::ClearPBOps() {
  }
 }

-void BlockDescBind::ClearPBVars() {
+void BlockDesc::ClearPBVars() {
  auto vars = this->desc_->mutable_vars();
  while (!vars->empty()) {
    // we do not own the VarDesc, so release the ownership.
--- a/Show More
+++ b/Show More