diff --git a/.clang-format b/.clang-format
index aff93435f5..8b58306273 100644
--- a/.clang-format
+++ b/.clang-format
@@ -19,7 +19,7 @@ BasedOnStyle:  Google
 IndentWidth:     2
 TabWidth:        2
 ContinuationIndentWidth: 4
-AccessModifierOffset: -2  # The private/protected/public has no indent in class
+AccessModifierOffset: -1  # The private/protected/public has no indent in class
 Standard:  Cpp11 
 AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6140340890..eeda759ff1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,6 +34,14 @@ repos:
         entry: bash ./tools/codestyle/cpplint_pre_commit.hook
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+-   repo: local
+    hooks:
+    -   id: pylint-doc-string
+        name: pylint
+        description: Check python docstring style using docstring_checker.
+        entry: bash ./tools/codestyle/pylint_pre_commit.hook
+        language: system
+        files: \.(py)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
diff --git a/.travis.yml b/.travis.yml
index 3391e2c3ca..8c77203092 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,8 @@ env:
 addons:
   ssh_known_hosts: 13.229.163.131
 before_install:
+  # For pylint dockstring checker
+  - sudo pip install pylint pytest astroid isort
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
diff --git a/AUTHORS.md b/AUTHORS.md
index 4ee0542098..11f227be71 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -4,6 +4,7 @@
 | backyes | Yan-Fei Wang |
 | baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
+| ChengduoZH | Cheng-Duo Zhao|
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 030bd19b3f..b35290e12f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,6 @@ option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FO
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
-option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
@@ -56,10 +55,13 @@ option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
-option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
+option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
+option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -100,6 +102,9 @@ endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
+set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
+  "A path setting fluid shared and static libraries")
+
 if (WITH_C_API AND WITH_PYTHON)
   message(WARNING "It is suggest not embedded a python interpreter in Paddle "
     "when using C-API. It will give an unpredictable behavior when using a "
@@ -117,13 +122,14 @@ else()
 endif()
 
 set(WITH_MKLML ${WITH_MKL})
-if (WITH_MKL AND AVX2_FOUND)
-    set(WITH_MKLDNN ON)
-else()
-    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
-    set(WITH_MKLDNN OFF)
+if (NOT DEFINED WITH_MKLDNN)
+    if (WITH_MKL AND AVX2_FOUND)
+        set(WITH_MKLDNN ON)
+    else()
+        message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+        set(WITH_MKLDNN OFF)
+    endif()
 endif()
-
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -142,7 +148,16 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
-include(external/grpc)
+
+if(WITH_DISTRIBUTE)
+    if(WITH_GRPC)
+        include(external/grpc)
+    else()
+        include(external/leveldb)
+        include(external/brpc)
+    endif()
+endif()
+
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
@@ -152,7 +167,6 @@ include(cupti)
 include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
-include(cpplint)            # set paddle c++ style
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries
@@ -201,7 +215,7 @@ endif(USE_NNPACK)
 
 add_subdirectory(proto)
 
-if(NOT MOBILE_INFERENCE)
+if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
     # "add_subdirectory(go)" should be placed after the following loine,
     # because it depends on paddle/optimizer.
     add_subdirectory(paddle/optimizer)
@@ -229,3 +243,7 @@ if(WITH_DOC)
     find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
+
+if (WITH_CONTRIB)
+    add_subdirectory(paddle/contrib)
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c36cffcb4..b1b02bcc2f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-
     create mode 100644 233
    ```
 
+	NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
+
 1. Build and test
 
    Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
diff --git a/Dockerfile b/Dockerfile
index ea39efd00b..752fea5951 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,12 +24,12 @@ COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
     apt-get install -y --allow-downgrades \
-    git python-pip python-dev openssh-server bison \
+    git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
     python-matplotlib gcc-4.8 g++-4.8 \
-    automake locales clang-format swig doxygen cmake  \
+    automake locales clang-format swig cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     net-tools libtool ccache && \
@@ -76,8 +76,10 @@ RUN easy_install -U pip && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
 
 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+
+#For docstring checker
+RUN pip install pylint pytest astroid isort
 
 COPY ./python/requirements.txt /root/
 RUN pip install -r /root/requirements.txt
@@ -101,6 +103,3 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-
-# development image default do build work
-CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/Dockerfile.android b/Dockerfile.android
index 848a7eba6f..48db2efea2 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
     unzip -q android-ndk-r14b-linux-x86_64.zip && \
     mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
     rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
index 7b66e8a5b5..fb4114356d 100644
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index 64816098a5..0000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
-  - Kubernetes: v1.6.2
-  - Linux Kernel: v3.10.0
-
-- Resource
-  - CPU: 10 Cores per Pod
-  - Memory: 5GB per Pod
-
-- Docker Image
-
-  We use different base Docker Image to run the benchmark on Kubernetes:
-  - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
-  - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
-  - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
-  vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
-  - Batch Size of training data.
-  - PServer count of the training job.
-  - The number of trainers.
-
-- Invariant
-  - The resource of trainer/pserver Pod.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-
-<table>
-<thead>
-<tr>
-<th>Batch Size </th>
-<th> 32</th>
-<th>64</th>
-<th>128 </th>
-<th>256</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td>-</td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-</tbody>
-</table>
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-
-<table>
-<thead>
-<tr>
-<th>PServer Count  </th>
-<th>10</th>
-<th>20</th>
-<th>40 </th>
-<th>60</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td>-</td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-</tbody>
-</table>
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = \div(T1, TN)$
-
-which S is the ratio of T1 over TN, training time of 1 and N trainers.
-The parallel efficiency is:
-
-$E = \div(S, N)$
-
-<table>
-<thead>
-<tr>
-<th>Trainer Counter  </th>
-<th>1</th>
-<th>10</th>
-<th>20 </th>
-<th>30</th>
-<th>40</th>
-<th>50</th>
-<th>60 </th>
-<th>70</th>
-<th>80</th>
-<th>90</th>
-<th>100 </th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-</tr>
-</tbody>
-</table>
-
-
-## Reproduce the benchmark
-
-TODO
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b62..0000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# if network is slowly, you may need to add proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# unset proxy if it is setted.
-# ENV https_proxy=""
-
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-#       so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# below lines may change a lot for debugging
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index d56a912b9b..0000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Infomation
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- cpu MHz		: 2101.000
-- cache size	: 20480 KB
-
-### Blas settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-<table>
-<thead>
-<tr>
-<th>Batch Size </th>
-<th> 32</th>
-<th>64</th>
-<th>128 </th>
-<th>256</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td> 15.44 </td>
-<td> 16.32 </td>
-<td> 16.74 </td>
-<td> 16.79 </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td> 15.97 </td>
-<td> 17.04 </td>
-<td> 17.60 </td>
-<td> 17.83 </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> 9.09 </td>
-<td> 9.10 </td>
-<td> 9.24 </td>
-<td> 8.66 </td>
-</tr>
-</tbody>
-</table>
-
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-<table>
-<thead>
-<tr>
-<th>Batch Size </th>
-<th> 32</th>
-<th>64</th>
-<th>128 </th>
-<th>256</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td> 190.20 </td>
-<td> 222.15 </td>
-<td> 247.40 </td>
-<td> 258.18 </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td> 170.96 </td>
-<td> 233.71 </td>
-<td> 256.14 </td>
-<td> 329.23 </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-</tr>
-</tbody>
-</table>
-
-### Accelerate Rate
-
-- Pserver Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-<table>
-<thead>
-<tr>
-<th>Trainer Count </th>
-<th>20</th>
-<th>40</th>
-<th>80</th>
-<th>100</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td> 263.29 (78.64%) </td>
-<td> 518.80 (77.47%) </td>
-<td> 836.26 (62.44%) </td>
-<td> 1019.29 (60.89%) </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2 (need more tests)   </td>
-<td> 326.85 (92.85%) </td>
-<td> 534.58 (75.93%) </td>
-<td> 853.30 (60.60%) </td>
-<td> 1041.99 (59.20%) </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-</tr>
-</tbody>
-</table>
-
-
-### Different Pserver Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples/ sec
-
-<table>
-<thead>
-<tr>
-<th>PServer Count </th>
-<th>3</th>
-<th>6</th>
-<th>10</th>
-<th>20</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid(should fix in next PR) </td>
-<td> 589.1 </td>
-<td> 592.6 </td>
-<td> 656.4 </td>
-<td> 655.8 </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2 (need more tests)   </td>
-<td> 593.4 </td>
-<td> 791.3 </td>
-<td> 729.7 </td>
-<td> 821.7 </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-</tr>
-</tbody>
-</table>
-
-
-*The performance gap between Fuild and v2 comes from the network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbos Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b6..0000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16job-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        paddle-job-pserver: vgg16job
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16job
-        - name: MKL_NUM_THREADS
-          value: "1"
-        - name: TRAINING_ROLE
-          value: "PSERVER"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        command: ["paddle_k8s", "start_fluid"]
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac00..0000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16job-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        paddle-job: vgg16job
-    spec:
-      imagePullSecrets:
-      - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        command: ["paddle_k8s", "start_fluid"]
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16job
-        - name: TRAINING_ROLE
-          value: "TRAINER"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
deleted file mode 100644
index 8c0501439e..0000000000
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Update to point to the source file.
-VGG_SRC="vgg16_fluid.py"
-
-export TRAINING_ROLE=PSERVER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-
-# Need to wait for the ps to start first.
-sleep 10
-echo "done start ps"
-
-export TRAINING_ROLE=TRAINER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f6..0000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
-  ret=$1
-  stdbuf -oL echo "job returned $ret...setting pod return message..."
-  stdbuf -oL echo "==============================="
-
-  if [ $ret -eq 136 ] ; then
-    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
-  elif [ $ret -eq 139 ] ; then
-    echo "Segmentation Fault" > /dev/termination-log
-  elif [ $ret -eq 1 ] ; then
-    echo "General Error" > /dev/termination-log
-  elif [ $ret -eq 134 ] ; then
-    echo "Program Abort" > /dev/termination-log
-  fi
-  stdbuf -oL echo "termination log wroted..."
-  exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
-  pserver_label="tf-job-pserver=${JOB_NAME}"
-  trainer_label="tf-job-trainer=${JOB_NAME}"
-
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
-  g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
-  g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
-  wait_running_pods
-
-  label="tf-job-pserver=${JOB_NAME}"
-  pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
-  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
-  --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
-  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
-  wait_running_pods
-
-  label="tf-job-trainer=${JOB_NAME}"
-  trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
-  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
-  --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
-  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-  check_trainer_ret $?
-}
-
-start_tf(){
-    if [[ "${TF_JOB_NAME}" == "worker" ]]; then
-        start_tf_trainer
-    else
-        start_tf_pserver
-    fi
-}
-
-usage() {
-    echo "usage: tf_k8s [<args>]:"
-    echo "  start_tf         Start tensorflow jobs"
-}
-
-case "$1" in
-    start_tf)
-        start_tf
-        ;;
-    --help)
-        usage
-        ;;
-    *)
-        usage
-        ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c70081..0000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16job-tf-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        tf-job-pserver: vgg16job-tf
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
-        imagePullPolicy: Always
-        command: ["tf_k8s", "start_tf"]
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PORT
-          value: "32036"
-        - name: ENTRY
-          value: "python vgg16_tf.py"
-        - name: JOB_NAME
-          value: vgg16job-tf
-        - name: PSERVERS_NUM
-          value: "10"
-        - name: TF_JOB_NAME 
-          value: "ps"
-        - name: TRAINERS_NUM
-          value: "20"
-        - name: BATCH_SIZE
-          value: "128"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: NUM_PASSES
-          value: "1"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3ad..0000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16job-tf-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        tf-job-trainer: vgg16job-tf
-    spec:
-      imagePullSecrets:
-      - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
-        imagePullPolicy: Always
-        command: ["tf_k8s", "start_tf"]
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PORT
-          value: "32036"
-        - name: JOB_NAME
-          value: vgg16job-tf
-        - name: TF_JOB_NAME 
-          value: "worker"
-        - name: ENTRY
-          value: "python vgg16_tf.py"
-        - name: PSERVERS_NUM
-          value: "10"
-        - name: BATCH_SIZE
-          value: "128"
-        - name: TRAINERS_NUM
-          value: "20"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: NUM_PASSES
-          value: "1"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf..0000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16v2job-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        paddle-job-pserver: vgg16v2job
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16v2job
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "python train.py"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        command: ["paddle_k8s", "start_pserver"]
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066..0000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16v2job-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        paddle-job: vgg16v2job
-    spec:
-      imagePullSecrets:
-        - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        command: ["paddle_k8s", "start_trainer", "v2"]
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16v2job
-        - name: BATCH_SIZE
-          value: "256"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "2"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index 05b5f3977c..0000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,308 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import argparse
-import functools
-import os
-from paddle.fluid import debuger
-
-
-def str2bool(v):
-    if v.lower() in ('yes', 'true', 't', 'y', '1'):
-        return True
-    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='CPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NCHW',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, now only support NCHW.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-parser.add_argument(
-    '--local',
-    type=str2bool,
-    default=True,
-    help='Whether to run as local mode.')
-
-parser.add_argument(
-    "--ps_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--trainer_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--profile", action='store_true', help="If set, profile a few steps.")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
-    "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
-
-
-def main():
-    if args.data_set == "cifar10":
-        classdim = 10
-        if args.data_format == 'NCHW':
-            data_shape = [3, 32, 32]
-        else:
-            data_shape = [32, 32, 3]
-    else:
-        classdim = 102
-        if args.data_format == 'NCHW':
-            data_shape = [3, 224, 224]
-        else:
-            data_shape = [224, 224, 3]
-
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(batch_acc)
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
-    # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
-        args.device_id)
-    exe = fluid.Executor(place)
-
-    # test
-    def test(exe):
-        test_pass_acc = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            outs = exe.run(inference_program,
-                           feed={"pixel": img_data,
-                                 "label": y_data},
-                           fetch_list=[batch_acc, batch_size])
-            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
-        return test_pass_acc.eval()
-
-    def train_loop(exe, trainer_prog):
-        iters = 0
-        ts = time.time()
-        train_pass_acc = fluid.average.WeightedAverage()
-        for pass_id in range(args.num_passes):
-            # train
-            start_time = time.time()
-            num_samples = 0
-            train_pass_acc.reset()
-
-            def run_step(batch_id, data):
-                img_data = np.array(
-                    map(lambda x: x[0].reshape(data_shape), data)).astype(
-                        "float32")
-                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                y_data = y_data.reshape([-1, 1])
-
-                loss, acc, b_size = exe.run(
-                    trainer_prog,
-                    feed={"pixel": img_data,
-                          "label": y_data},
-                    fetch_list=[avg_cost, batch_acc, batch_size])
-                return loss, acc, b_size
-
-            if args.profile and args.task_index == 0:
-                # warmup.
-                for batch_id, data in enumerate(train_reader()):
-                    if batch_id > 5: break
-                    run_step(batch_id, data)
-                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
-                    for batch_id, data in enumerate(train_reader()):
-                        if batch_id > 5: break
-                        run_step(batch_id, data)
-
-            for batch_id, data in enumerate(train_reader()):
-                ts = time.time()
-                loss, acc, b_size = run_step(batch_id, data)
-                iters += 1
-                num_samples += len(data)
-                train_pass_acc.add(value=acc, weight=b_size)
-                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
-                                            len(data) / (time.time() - ts))
-                )  # The accuracy is the accumulation of batches, but not the current batch.
-
-            pass_elapsed = time.time() - start_time
-            pass_train_acc = train_pass_acc.eval()
-            pass_test_acc = test(exe)
-            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
-                  "Train accuracy = %f, Test accuracy = %f\n" %
-                  (args.task_index, pass_id, num_samples / pass_elapsed,
-                   pass_train_acc, pass_test_acc))
-
-    if args.local:
-        # Parameter initialization
-        exe.run(fluid.default_startup_program())
-
-        # data reader
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
-                else paddle.dataset.flowers.train(),
-                buf_size=5120),
-            batch_size=args.batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.cifar.test10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-            batch_size=args.batch_size)
-        train_loop(exe, fluid.default_main_program())
-    else:
-        trainers = int(os.getenv("TRAINERS"))  # total trainer count
-        print("trainers total: ", trainers)
-
-        training_role = os.getenv(
-            "TRAINING_ROLE",
-            "TRAINER")  # get the training role: trainer/pserver
-
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            trainer_id=args.task_index,
-            pservers=args.ps_hosts,
-            trainers=trainers)
-
-        if training_role == "PSERVER":
-            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
-                "PADDLE_INIT_PORT")
-            if not current_endpoint:
-                print("need env SERVER_ENDPOINT")
-                exit(1)
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            # Parameter initialization
-            exe.run(fluid.default_startup_program())
-
-            # data reader
-            train_reader = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
-                    else paddle.dataset.flowers.train(),
-                    buf_size=5120),
-                batch_size=args.batch_size)
-            test_reader = paddle.batch(
-                paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
-                paddle.dataset.flowers.test(),
-                batch_size=args.batch_size)
-
-            trainer_prog = t.get_trainer_program()
-            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
-            exe.run(fluid.default_startup_program())
-            train_loop(exe, trainer_prog)
-        else:
-            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-
-
-def print_arguments():
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == "__main__":
-    print_arguments()
-    main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 2d220478ac..0000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,366 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can get distribution example template structure here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='CPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NHWC',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, NCHW=[batch, channels, height, width].'
-    'Only support NHWC right now.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-
-parser.add_argument(
-    "--ps_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--worker_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
-    "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
-    def __init__(self):
-        self.parameters = []
-
-    def batch_norm_relu(self, inputs, is_training):
-        """Performs a batch normalization followed by a ReLU."""
-        # We set fused=True for a significant speed boost. See
-        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
-        inputs = tf.layers.batch_normalization(
-            inputs=inputs,
-            axis=1 if args.data_format == 'NCHW' else -1,
-            momentum=0.9,
-            epsilon=1e-05,
-            center=True,
-            scale=True,
-            training=is_training,
-            fused=True)
-        inputs = tf.nn.relu(inputs)
-        return inputs
-
-    def conv_bn_layer(self,
-                      name,
-                      images,
-                      kernel_shape,
-                      is_training,
-                      drop_rate=0.0):
-        with tf.name_scope(name) as scope:
-            kernel = tf.Variable(
-                tf.truncated_normal(
-                    kernel_shape, dtype=tf.float32, stddev=1e-1),
-                name='weights')
-            conv = tf.nn.conv2d(
-                images,
-                kernel, [1, 1, 1, 1],
-                data_format=args.data_format,
-                padding='SAME')
-            biases = tf.Variable(
-                tf.constant(
-                    0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
-                trainable=True,
-                name='biases')
-            out = tf.nn.bias_add(conv, biases)
-            out = self.batch_norm_relu(out, is_training)
-            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
-            return out
-
-    def fc_layer(self, name, inputs, shape):
-        with tf.name_scope(name) as scope:
-            fc_w = tf.Variable(
-                tf.truncated_normal(
-                    shape, dtype=tf.float32, stddev=1e-1),
-                name='weights')
-            fc_b = tf.Variable(
-                tf.constant(
-                    0.0, shape=[shape[-1]], dtype=tf.float32),
-                trainable=True,
-                name='biases')
-            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
-            return out
-
-    def network(self, images, class_dim, is_training):
-        """ VGG16 model structure.
-
-            TODO(kuke): enable this network to support the 'NCHW' data format
-        """
-
-        # conv1
-        conv1_1 = self.conv_bn_layer(
-            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
-        conv1_2 = self.conv_bn_layer(
-            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
-        # pool1
-        pool1 = tf.nn.max_pool(
-            conv1_2,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool1')
-        # conv2
-        conv2_1 = self.conv_bn_layer(
-            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
-        conv2_2 = self.conv_bn_layer(
-            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
-        # pool2
-        pool2 = tf.nn.max_pool(
-            conv2_2,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool2')
-        # conv3
-        conv3_1 = self.conv_bn_layer(
-            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
-        conv3_2 = self.conv_bn_layer(
-            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
-        conv3_3 = self.conv_bn_layer(
-            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
-        # pool3
-        pool3 = tf.nn.max_pool(
-            conv3_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool3')
-        # conv4
-        conv4_1 = self.conv_bn_layer(
-            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
-        conv4_2 = self.conv_bn_layer(
-            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv4_3 = self.conv_bn_layer(
-            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
-        # pool4
-        pool4 = tf.nn.max_pool(
-            conv4_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool4')
-        # conv5
-        conv5_1 = self.conv_bn_layer(
-            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv5_2 = self.conv_bn_layer(
-            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv5_3 = self.conv_bn_layer(
-            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
-        # pool5
-        pool5 = tf.nn.max_pool(
-            conv5_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool4')
-        # flatten
-        shape = int(np.prod(pool5.get_shape()[1:]))
-        pool5_flat = tf.reshape(pool5, [-1, shape])
-        # fc1
-        drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
-        fc1 = self.fc_layer('fc1', drop, [shape, 512])
-        # fc2
-        bn = self.batch_norm_relu(fc1, is_training)
-        drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
-        fc2 = self.fc_layer('fc2', drop, [512, 512])
-
-        fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
-        return fc3
-
-
-def run_benchmark(cluster_spec, server):
-    """Run benchmark on cifar10 or flowers."""
-
-    if args.data_set == "cifar10":
-        class_dim = 10
-        raw_shape = (3, 32, 32)
-        dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
-            None, 3, 32, 32)
-    else:
-        class_dim = 102
-        raw_shape = (3, 224, 224)
-        dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
-            None, 3, 224, 224)
-
-    device = tf.train.replica_device_setter(
-        worker_device="/job:worker/task:{}".format(args.task_index),
-        cluster=cluster_spec)
-
-    with tf.device(device):
-        images = tf.placeholder(tf.float32, shape=dat_shape)
-        labels = tf.placeholder(tf.int64, shape=(None, ))
-        is_training = tf.placeholder('bool')
-        onehot_labels = tf.one_hot(labels, depth=class_dim)
-
-        vgg16 = VGG16Model()
-        logits = vgg16.network(images, class_dim, is_training)
-        loss = tf.losses.softmax_cross_entropy(
-            onehot_labels=onehot_labels, logits=logits)
-        avg_loss = tf.reduce_mean(loss)
-
-        correct = tf.equal(tf.argmax(logits, 1), labels)
-        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
-        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
-        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-        global_step = tf.Variable(0, name='global_step', trainable=False)
-        with tf.control_dependencies(update_ops):
-            train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
-        summary_op = tf.summary.merge_all()
-        init_op = tf.global_variables_initializer()
-
-    # data reader
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.test10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-
-    # test
-    def test():
-        test_accs = []
-        for batch_id, data in enumerate(test_reader()):
-            test_images = np.array(
-         map(lambda x: np.transpose(x[0].reshape(raw_shape),
-         axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
-            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
-            test_accs.append(
-                accuracy.eval(feed_dict={
-                    images: test_images,
-                    labels: test_labels,
-                    is_training: False
-                }))
-        return np.mean(test_accs)
-
-    config = tf.ConfigProto(
-        intra_op_parallelism_threads=1,
-        inter_op_parallelism_threads=1,
-        log_device_placement=True)
-    config.gpu_options.allow_growth = True
-
-    hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
-    with tf.train.MonitoredTrainingSession(
-            master=server.target,
-            is_chief=(args.task_index == 0),
-            hooks=hooks,
-            config=config) as sess:
-        iters, num_samples, start_time = 0, 0, 0.0
-        for pass_id in range(args.num_passes):
-            # train
-            num_samples = 0
-            start_time = time.time()
-            for batch_id, data in enumerate(train_reader()):
-                train_images = np.array(
-                    map(lambda x: np.transpose(x[0].reshape(raw_shape),
-                    axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
-                train_labels = np.array(map(lambda x: x[1], data)).astype(
-                    'int64')
-                iter_begin_time = time.time()
-                _, loss, acc = sess.run([train_op, avg_loss, accuracy],
-                                        feed_dict={
-                                            images: train_images,
-                                            labels: train_labels,
-                                            is_training: True
-                                        })
-                iters += 1
-                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
-                    % (pass_id, iters, loss, acc,
-                       len(data) / (time.time() - iter_begin_time)))
-                num_samples += len(data)
-            train_elapsed = time.time() - start_time
-            # test
-            pass_test_acc = test()
-            print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
-                  (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    print_arguments()
-
-    ps_hosts = args.ps_hosts.split(",")
-    worker_hosts = args.worker_hosts.split(",")
-
-    # Create a cluster from the parameter server and worker hosts.
-    cluster_spec = tf.train.ClusterSpec({
-        "ps": ps_hosts,
-        "worker": worker_hosts
-    })
-
-    # Create and start a server for the local task.
-    server = tf.train.Server(
-        cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
-    if args.job_name == "ps":
-        print("start pserver")
-        server.join()
-    elif args.job_name == "worker":
-        print("start worker")
-        run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7..0000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
-    BATCH_SIZE = int(BATCH_SIZE)
-else:
-    BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
-    def conv_block(input, num_filter, groups, num_channels=None):
-        return paddle.networks.img_conv_group(
-            input=input,
-            num_channels=num_channels,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act=paddle.activation.Relu(),
-            pool_type=paddle.pooling.Max())
-
-    assert len(nums) == 5
-    # the channel of input feature is 3
-    conv1 = conv_block(input, 64, nums[0], 3)
-    conv2 = conv_block(conv1, 128, nums[1])
-    conv3 = conv_block(conv2, 256, nums[2])
-    conv4 = conv_block(conv3, 512, nums[3])
-    conv5 = conv_block(conv4, 512, nums[4])
-
-    fc_dim = 512
-    fc1 = paddle.layer.fc(input=conv5,
-                          size=fc_dim,
-                          act=paddle.activation.Relu(),
-                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    fc2 = paddle.layer.fc(input=fc1,
-                          size=fc_dim,
-                          act=paddle.activation.Relu(),
-                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    out = paddle.layer.fc(input=fc2,
-                          size=class_dim,
-                          act=paddle.activation.Softmax())
-    return out
-
-
-def vgg13(input, class_dim):
-    nums = [2, 2, 2, 2, 2]
-    return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
-    nums = [2, 2, 3, 3, 3]
-    return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
-    nums = [2, 2, 4, 4, 4]
-    return vgg(input, nums, class_dim)
-
-
-def main():
-    global ts
-    paddle.init(use_gpu=False)
-    image = paddle.layer.data(
-        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
-    lbl = paddle.layer.data(
-        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
-    extra_layers = None
-    # NOTE: for v2 distributed training need averaging updates.
-    learning_rate = 1e-3 / NODE_COUNT
-    out = vgg16(image, class_dim=CLASS_DIM)
-    cost = paddle.layer.classification_cost(input=out, label=lbl)
-
-    # Create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # Create optimizer
-    optimizer = paddle.optimizer.Momentum(
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
-                                                         BATCH_SIZE),
-        learning_rate=learning_rate / BATCH_SIZE,
-        learning_rate_decay_a=0.1,
-        learning_rate_decay_b=128000 * 35,
-        learning_rate_schedule="discexp", )
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            cifar.train10(),
-            # To use other data, replace the above line with:
-            # reader.train_reader('train.list'),
-            buf_size=1000),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        cifar.test10(),
-        # To use other data, replace the above line with:
-        # reader.test_reader('val.list'),
-        batch_size=BATCH_SIZE)
-
-    # Create trainer
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 extra_layers=extra_layers,
-                                 is_local=False)
-
-    # End batch and end pass event handler
-    def event_handler(event):
-        global ts, ts_pass
-        if isinstance(event, paddle.event.BeginPass):
-            ts_pass = time.time()
-        if isinstance(event, paddle.event.BeginIteration):
-            ts = time.time()
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 1 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics,
-                    time.time() - ts)
-        if isinstance(event, paddle.event.EndPass):
-            print "Pass %d end, spent: %f" % (event.pass_id,
-                                              time.time() - ts_pass)
-            result = trainer.test(reader=test_reader)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
-    trainer.train(
-        reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 0000000000..b9eaca5ee6
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
new file mode 100644
index 0000000000..28cade4634
--- /dev/null
+++ b/benchmark/fluid/README.md
@@ -0,0 +1,99 @@
+# Fluid Benchmark
+
+This directory contains several models configurations and tools that used to run
+Fluid benchmarks for local and distributed training.
+
+
+## Run the Benchmark
+
+To start, run the following command to get the full help message:
+
+```bash
+python fluid_benchmark.py --help
+```
+
+Currently supported `--model` argument include:
+
+* mnist
+* resnet
+    * you can chose to use different dataset using `--data_set cifar10` or
+      `--data_set flowers`.
+* vgg
+* stacked_dynamic_lstm
+* machine_translation
+
+* Run the following command to start a benchmark job locally:
+    ```bash
+      python fluid_benchmark.py --model mnist --device GPU
+    ```
+    You can choose to use GPU/CPU training. With GPU training, you can specify
+    `--gpus <gpu_num>` to run multi GPU training.
+    You can set async mode parameter server. With async mode, you can specify
+    `--async_mode` to train model asynchronous.
+* Run distributed training with parameter servers:
+    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
+    * start parameter servers:
+        ```bash
+        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        sleep 15
+        ```
+    * start trainers:
+        ```bash
+        PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        ```
+* Run distributed training using NCCL2
+    ```bash
+    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
+    ```
+
+## Prepare the RecordIO file to Achieve Better Performance
+
+Run the following command will generate RecordIO files like "mnist.recordio" under the path
+and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
+at any time using `fluid.batch`.
+
+```bash
+python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
+```
+
+## Run Distributed Benchmark on Kubernetes Cluster
+
+You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
+have to start all those processes mannually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with, you may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it by your own. Once you've got the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
+We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
+distributed benchmark jobs to your cluster. To generate a job yaml, just run:
+
+```bash
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
+```
+
+Then the yaml files are generated under directory `myjob`, you can run:
+
+```bash
+kubectl create -f myjob/
+```
+
+The job shall start.
+
+
+## Notes for Run Fluid Distributed with NCCL2 and RDMA
+
+Before running NCCL2 distributed jobs, please check that whether your node has multiple network
+interfaces, try to add the environment variable `export NCCL_SOCKET_IFNAME=eth0` to use your actual
+network device.
+
+To run high-performance distributed training, you must prepare your hardware environment to be
+able to run RDMA enabled network communication, please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
+note for details.
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
new file mode 100644
index 0000000000..68a3d42d7a
--- /dev/null
+++ b/benchmark/fluid/args.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+__all__ = ['parse_args', ]
+
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    #  args related to learning rate
+    parser.add_argument(
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
+    # TODO(wuyi): add "--use_fake_data" option back.
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    # this option is available only for vgg and resnet.
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_true',
+        help='If set, do not test the testset during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+    parser.add_argument(
+        '--no_split_var',
+        action='store_true',
+        default=False,
+        help='Whether split variables into blocks when update_method is pserver')
+    parser.add_argument(
+        '--async_mode',
+        action='store_true',
+        default=False,
+        help='Whether start pserver in async mode to support ASGD')
+    parser.add_argument(
+        '--use_reader_op',
+        action='store_true',
+        help='Whether to use reader op, and must specify the data path if set this to true.'
+    )
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the training recordio files.')
+    args = parser.parse_args()
+    return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
new file mode 100644
index 0000000000..aa70783ecd
--- /dev/null
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -0,0 +1,357 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+from args import *
+
+
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
+        # append gen_nccl_id at the end of startup program
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        port = os.getenv("PADDLE_PSERVER_PORT")
+        worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+        worker_endpoints = []
+        for ip in worker_ips.split(","):
+            worker_endpoints.append(':'.join([ip, port]))
+        num_trainers = len(worker_endpoints)
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+        worker_endpoints.remove(current_endpoint)
+
+        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+            name="NCCLID",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+        fluid.default_startup_program().global_block().append_op(
+            type="gen_nccl_id",
+            inputs={},
+            outputs={"NCCLID": nccl_id_var},
+            attrs={
+                "endpoint": current_endpoint,
+                "endpoint_list": worker_endpoints,
+                "trainer_id": trainer_id
+            })
+        return nccl_id_var, num_trainers, trainer_id
+    else:
+        raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
+                        "nccl-based dist train.")
+
+
+def dist_transpile(trainer_id, args):
+    if trainer_id < 0:
+        return None, None
+
+    # the port of all pservers, needed by both trainer and pserver
+    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+    # comma separated ips of all pservers, needed by trainer and
+    # pserver
+    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+    eplist = []
+    for ip in pserver_ips.split(","):
+        eplist.append(':'.join([ip, port]))
+    pserver_endpoints = ",".join(eplist)
+    # total number of workers/trainers in the job, needed by
+    # trainer and pserver
+    trainers = int(os.getenv("PADDLE_TRAINERS"))
+    # the IP of the local machine, needed by pserver only
+    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+    # the role, should be either PSERVER or TRAINER
+    training_role = os.getenv("PADDLE_TRAINING_ROLE")
+
+    t = distribute_transpiler.DistributeTranspiler()
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=not args.async_mode,
+        slice_var_up=not args.no_split_var)
+    if training_role == "PSERVER":
+        pserver_program = t.get_pserver_program(current_endpoint)
+        pserver_startup_program = t.get_startup_program(current_endpoint,
+                                                        pserver_program)
+        return pserver_program, pserver_startup_program
+    elif training_role == "TRAINER":
+        train_program = t.get_trainer_program()
+        return train_program, fluid.default_startup_program()
+    else:
+        raise ValueError(
+            'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+        )
+
+
+def test(exe, inference_program, test_reader, feeder, batch_acc):
+    accuracy_evaluator = fluid.metrics.Accuracy()
+    for batch_id, data in enumerate(test_reader()):
+        acc = exe.run(inference_program,
+                      feed=feeder.feed(data),
+                      fetch_list=[batch_acc])
+        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+
+    return accuracy_evaluator.eval()
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
+          args, train_prog, startup_prog):
+    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+        place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(train_prog)
+        return
+
+    if args.use_fake_data:
+        raise Exception(
+            "fake data is not supported in single GPU test for now.")
+
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_losses = []
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+
+            if args.use_reader_op:
+                try:
+                    loss = exe.run(train_prog, fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss = exe.run(train_prog,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_loss])
+            iters += 1
+            batch_id += 1
+            # FIXME(wuyi): For use_reader_op, if the current
+            # pass is not the last, the last batch of this pass
+            # is also equal to args.batch_size.
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
+            train_losses.append(loss)
+            print("Pass: %d, Iter: %d, Loss: %f\n" %
+                  (pass_id, iters, np.mean(train_losses)))
+        print_train_time(start_time, time.time(), num_samples)
+        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
+        # evaluation
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
+                                 batch_acc)
+            print(", Test Accuracy: %f" % pass_test_acc)
+        print("\n")
+        # TODO(wuyi): add warmup passes to get better perf data.
+        exit(0)
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
+                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
+                   num_trainers, trainer_id):
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+
+    # generate fake:
+    if args.use_fake_data:
+        for var in feed_var_list:
+            v = startup_prog.global_block().clone_variable(var)
+            var.persistable = True
+            v.persistable = True
+
+            real_shape = list(var.shape)
+            real_shape[0] = args.batch_size / args.gpus
+            startup_prog.global_block().append_op(
+                outputs={"Out": v},
+                type="fill_constant",
+                attrs={"shape": real_shape,
+                       "value": 1.0,
+                       "dtype": var.dtype})
+
+    if nccl_id_var and trainer_id == 0:
+        #FIXME(wuyi): wait other trainer to start listening
+        time.sleep(30)
+
+    startup_exe = fluid.Executor(place)
+    startup_exe.run(startup_prog)
+    strategy = fluid.ExecutionStrategy()
+    strategy.num_threads = 1
+    strategy.allow_op_delay = False
+    exe = fluid.ParallelExecutor(
+        True,
+        avg_loss.name,
+        exec_strategy=strategy,
+        num_trainers=num_trainers,
+        trainer_id=trainer_id)
+
+    for pass_id in range(args.pass_num):
+        num_samples = 0
+        iters = 0
+        start_time = time.time()
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
+            if args.profile and pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif args.profile and pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if args.use_fake_data or args.use_reader_op:
+                try:
+                    loss, = exe.run([avg_loss.name])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+            if args.update_method == "pserver":
+                exe.bcast_params()
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
+            iters += 1
+            if batch_id % 1 == 0:
+                print("Pass %d, batch %d, loss %s" %
+                      (pass_id, batch_id, np.array(loss)))
+            batch_id += 1
+
+        print_train_time(start_time, time.time(), num_samples)
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            # we have not implement record io for test
+            # skip test when use args.use_reader_op
+            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
+                            batch_acc)
+            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def print_train_time(start_time, end_time, num_samples):
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+
+
+def main():
+    args = parse_args()
+    print_arguments(args)
+
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
+
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    model_def = __import__("models.%s" % args.model, fromlist=["models"])
+    train_args = list(model_def.get_model(args))
+    train_args.append(args)
+    # Run optimizer.minimize(avg_loss)
+    train_args[2].minimize(train_args[0])
+    if args.memory_optimize:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    if args.update_method == "pserver":
+        train_prog, startup_prog = dist_transpile(trainer_id, args)
+        if not train_prog:
+            raise Exception(
+                "Must configure correct environments to run dist train.")
+        train_args.extend([train_prog, startup_prog])
+        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
+            train_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*train_args)
+        train(*train_args)
+        exit(0)
+
+    # for other update methods, use default programs
+    train_args.append(fluid.default_main_program())
+    train_args.append(fluid.default_startup_program())
+
+    if args.update_method == "nccl2":
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
+    if args.gpus == 1:
+        # NOTE: parallel executor use profiler interanlly
+        if args.use_nvprof and args.device == 'GPU':
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(*train_args)
+        else:
+            train(*train_args)
+    else:
+        if args.device == "CPU":
+            raise Exception("Only support GPU perf with parallel exe")
+        train_args.extend([nccl_id_var, num_trainers, trainer_id])
+        train_parallel(*train_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
new file mode 100644
index 0000000000..9da8a69af1
--- /dev/null
+++ b/benchmark/fluid/kube_gen_job.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import copy
+import argparse
+import random
+import os
+from kube_templates import pserver, trainer, envs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Generate dist job yamls.')
+
+    parser.add_argument(
+        '--jobname', default="paddlejob", help='unique job name')
+    parser.add_argument(
+        '--cpu', default=1, type=int, help='CPU cores per trainer node')
+    parser.add_argument(
+        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
+    parser.add_argument(
+        '--gpu', default=0, type=int, help='num of GPUs per node')
+    parser.add_argument(
+        '--image',
+        default="bootstrapper:5000/fluid_benchmark:gpu",
+        help='num of GPUs per node')
+    parser.add_argument(
+        '--pservers', default=1, type=int, help='num of pservers')
+    parser.add_argument(
+        '--trainers', default=1, type=int, help='num of trainers')
+    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
+    parser.add_argument(
+        '--psmemory', default=1, type=int, help='pserver memory')
+    parser.add_argument(
+        '--port', default=30236, type=int, help='num of trainers')
+    parser.add_argument(
+        '--entry', default="python train.py", help='command to run')
+    parser.add_argument(
+        '--fluid', default=1, type=int, help='whether is fluid job')
+    parser.add_argument(
+        '--rdma', action='store_true', help='whether mount rdma libs')
+    parser.add_argument(
+        '--disttype',
+        default="pserver",
+        type=str,
+        choices=['pserver', 'nccl2', 'local'],
+        help='pserver or nccl2 or local')
+
+    args = parser.parse_args()
+    return args
+
+
+def gen_job():
+    ps = pserver
+    tn = trainer
+    args = parse_args()
+
+    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
+    tn_container = tn["spec"]["template"]["spec"]["containers"][0]
+
+    if args.fluid == 1:
+        ps_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+        tn_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+    ps["metadata"]["name"] = args.jobname + "-pserver"
+    ps["spec"]["template"]["metadata"]["labels"][
+        "paddle-job-pserver"] = args.jobname
+    tn["metadata"]["name"] = args.jobname + "-trainer"
+    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
+
+    ps_container["image"] = args.image
+    tn_container["image"] = args.image
+
+    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
+    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
+
+    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
+    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
+    if args.gpu > 0:
+        tn_container["resources"]["requests"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+        tn_container["resources"]["limits"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+
+    ps["spec"]["replicas"] = int(args.pservers)
+    tn["spec"]["parallelism"] = int(args.trainers)
+    tn["spec"]["completions"] = int(args.trainers)
+    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
+    ps_container["ports"][0]["containerPort"] = args.port
+    spreadport = random.randint(40000, 60000)
+    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
+    tn_container["ports"][0]["containerPort"] = spreadport
+
+    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
+    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PSERVERS", "value": str(args.pservers)})
+    envs.append({"name": "ENTRY", "value": args.entry})
+    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
+    # NOTE: these directories below are cluster specific, please modify
+    # this settings before you run on your own cluster.
+    envs.append({
+        "name": "LD_LIBRARY_PATH",
+        "value":
+        "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
+    })
+
+    volumes = [{
+        "name": "nvidia-driver",
+        "hostPath": {
+            "path": "/usr/local/nvidia/lib64"
+        }
+    }]
+    volumeMounts = [{
+        "mountPath": "/usr/local/nvidia/lib64",
+        "name": "nvidia-driver"
+    }]
+
+    if args.rdma:
+        volumes.extend([{
+            "name": "ibetc",
+            "hostPath": {
+                "path": "/etc/libibverbs.d"
+            }
+        }, {
+            "name": "iblibs",
+            "hostPath": {
+                "path": "/usr/local/rdma"
+            }
+        }, {
+            "name": "valgrind",
+            "hostPath": {
+                "path": "/usr/lib64/mlnx_ofed/valgrind"
+            }
+        }])
+        volumeMounts.extend([{
+            "mountPath": "/etc/libibverbs.d",
+            "name": "ibetc"
+        }, {
+            "mountPath": "/usr/local/rdma",
+            "name": "iblibs"
+        }, {
+            "mountPath": "/usr/lib64/mlnx_ofed/valgrind",
+            "name": "valgrind"
+        }])
+        # append shm for NCCL2
+        volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
+        volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
+
+    tn["spec"]["template"]["spec"]["volumes"] = volumes
+    tn_container["volumeMounts"] = volumeMounts
+
+    ps_container["env"] = envs
+    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
+    tn_container["env"] = envs
+    if args.disttype == "pserver":
+        tn_container["env"].append({
+            "name": "TRAINING_ROLE",
+            "value": "TRAINER"
+        })
+    elif args.disttype == "nccl2" or args.disttype == "local":
+        # NCCL2 have no training role, set to plain WORKER
+        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
+
+    os.mkdir(args.jobname)
+    if args.disttype == "pserver":
+        with open("%s/pserver.yaml" % args.jobname, "w") as fn:
+            yaml.dump(ps, fn)
+
+    with open("%s/trainer.yaml" % args.jobname, "w") as fn:
+        yaml.dump(tn, fn)
+
+
+if __name__ == "__main__":
+    gen_job()
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
new file mode 100644
index 0000000000..2d09d940a5
--- /dev/null
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pserver import pserver
+from trainer import trainer
+
+__all__ = ["pserver", "trainer", "envs"]
+
+envs = [
+    # envs that don't need to change
+    {
+        "name": "GLOG_v",
+        "value": "0"
+    },
+    {
+        "name": "GLOG_logtostderr",
+        "value": "1"
+    },
+    {
+        "name": "TOPOLOGY",
+        "value": ""
+    },
+    {
+        "name": "TRAINER_PACKAGE",
+        "value": "/workspace"
+    },
+    {
+        "name": "PADDLE_INIT_NICS",
+        "value": "eth2"
+    },
+    {
+        "name": "NAMESPACE",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "metadata.namespace"
+            }
+        }
+    },
+    {
+        "name": "POD_IP",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "status.podIP"
+            }
+        }
+    },
+    {
+        "name": "PADDLE_CURRENT_IP",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "status.podIP"
+            }
+        }
+    }
+]
diff --git a/benchmark/fluid/kube_templates/pserver.py b/benchmark/fluid/kube_templates/pserver.py
new file mode 100644
index 0000000000..b54982c806
--- /dev/null
+++ b/benchmark/fluid/kube_templates/pserver.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pserver = {
+    "apiVersion": "extensions/v1beta1",
+    "kind": "ReplicaSet",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "replicas": 1,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job-pserver": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "containers": [{
+                    "name": "pserver",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_pserver"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
diff --git a/benchmark/fluid/kube_templates/trainer.py b/benchmark/fluid/kube_templates/trainer.py
new file mode 100644
index 0000000000..b915d31e37
--- /dev/null
+++ b/benchmark/fluid/kube_templates/trainer.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+trainer = {
+    "apiVersion": "batch/v1",
+    "kind": "Job",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "parallelism": 4,
+        "completions": 4,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "restartPolicy": "Never",
+                "containers": [{
+                    "name": "trainer",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    # to let container set rlimit
+                    "securityContext": {
+                        "privileged": True
+                        # TODO(wuyi): use below specific cap instead of privileged,
+                        # using privileged will cause all GPU device are visible
+                        # in the container.
+                        # "capabilities": {
+                        #     "add": ["SYS_RESOURCE"]
+                        # }
+                    },
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_trainer", "v2"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
deleted file mode 100644
index 400200c474..0000000000
--- a/benchmark/fluid/mnist.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-
-SEED = 1
-DTYPE = "float32"
-
-# random seed must set before configuring the network.
-# fluid.default_startup_program().random_seed = SEED
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("mnist model benchmark.")
-    parser.add_argument(
-        '--batch_size', type=int, default=128, help='The minibatch size.')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=35, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=5, help='The number of passes.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
-def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=data,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-
-    # TODO(dzhwinter) : refine the initializer and random seed settting
-    SIZE = 10
-    input_shape = conv_pool_2.shape
-    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
-    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
-
-    predict = fluid.layers.fc(
-        input=conv_pool_2,
-        size=SIZE,
-        act="softmax",
-        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale)))
-    return predict
-
-
-def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    test_pass_acc = fluid.average.WeightedAverage()
-    for batch_id, data in enumerate(test_reader()):
-        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
-                                data)).astype(DTYPE)
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = y_data.reshape([len(y_data), 1])
-
-        acc, weight = exe.run(inference_program,
-                              feed={"pixel": img_data,
-                                    "label": y_data},
-                              fetch_list=[batch_acc, batch_size_tensor])
-        test_pass_acc.add(value=acc, weight=weight)
-        pass_acc = test_pass_acc.eval()
-    return pass_acc
-
-
-def run_benchmark(model, args):
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
-    start_time = time.time()
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    predict = model(images)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
-    opt.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    # Initialize executor
-    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
-
-    # Parameter initialization
-    exe.run(fluid.default_startup_program())
-
-    # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size)
-
-    accuracy = fluid.metrics.Accuracy()
-    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            img_data = np.array(
-                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([len(y_data), 1])
-
-            outs = train_exe.run(
-                feed={"pixel": img_data,
-                      "label": y_data},
-                fetch_list=[
-                    avg_cost.name, batch_acc.name, batch_size_tensor.name
-                ]
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.update(
-                value=np.array(np.mean(outs[1])),
-                weight=np.mean(np.array(outs[2])))
-            iters += 1
-            num_samples += len(y_data)
-            loss = np.mean(np.array(outs[0]))
-            acc = np.mean(np.array(outs[1]))
-            train_losses.append(loss)
-            train_accs.append(acc)
-            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
-                  (pass_id, iters, loss, acc))
-
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                     inference_program)
-        exit(0)
-
-
-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- mnist Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    if args.use_nvprof and args.device == 'GPU':
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-            run_benchmark(cnn_model, args)
-    else:
-        run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py
new file mode 100644
index 0000000000..1c3fcac8dd
--- /dev/null
+++ b/benchmark/fluid/models/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/models/machine_translation.py
similarity index 59%
rename from benchmark/fluid/machine_translation.py
rename to benchmark/fluid/models/machine_translation.py
index adde5f21ac..69541adf6b 100644
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -27,74 +27,6 @@ import paddle.fluid.core as core
 import paddle.fluid.framework as framework
 from paddle.fluid.executor import Executor
 
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--embedding_dim",
-    type=int,
-    default=512,
-    help="The dimension of embedding table. (default: %(default)d)")
-parser.add_argument(
-    "--encoder_size",
-    type=int,
-    default=512,
-    help="The size of encoder bi-rnn unit. (default: %(default)d)")
-parser.add_argument(
-    "--decoder_size",
-    type=int,
-    default=512,
-    help="The size of decoder rnn unit. (default: %(default)d)")
-parser.add_argument(
-    "--batch_size",
-    type=int,
-    default=16,
-    help="The sequence number of a mini-batch data. (default: %(default)d)")
-parser.add_argument(
-    '--skip_batch_num',
-    type=int,
-    default=5,
-    help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
-    '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
-    "--dict_size",
-    type=int,
-    default=30000,
-    help="The dictionary capacity. Dictionaries of source sequence and "
-    "target dictionary have same capacity. (default: %(default)d)")
-parser.add_argument(
-    "--pass_num",
-    type=int,
-    default=2,
-    help="The pass number to train. (default: %(default)d)")
-parser.add_argument(
-    "--learning_rate",
-    type=float,
-    default=0.0002,
-    help="Learning rate used to train the model. (default: %(default)f)")
-parser.add_argument(
-    "--infer_only", action='store_true', help="If set, run forward only.")
-parser.add_argument(
-    "--beam_size",
-    type=int,
-    default=3,
-    help="The width for beam searching. (default: %(default)d)")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='GPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    "--max_length",
-    type=int,
-    default=250,
-    help="The maximum length of sequence when doing generation. "
-    "(default: %(default)d)")
-parser.add_argument(
-    '--with_test',
-    action='store_true',
-    help='If set, test the testset during training.')
-
 
 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
     def linear(inputs):
@@ -264,116 +196,39 @@ def lodtensor_to_ndarray(lod_tensor):
     return ndarray
 
 
-def train():
+def get_model(args):
+    if args.use_reader_op:
+        raise Exception("machine_translation do not support reader op for now.")
+    embedding_dim = 512
+    encoder_size = 512
+    decoder_size = 512
+    dict_size = 30000
+    beam_size = 3
+    max_length = 250
     avg_cost, feeding_list = seq_to_seq_net(
-        args.embedding_dim,
-        args.encoder_size,
-        args.decoder_size,
-        args.dict_size,
-        args.dict_size,
+        embedding_dim,
+        encoder_size,
+        decoder_size,
+        dict_size,
+        dict_size,
         False,
-        beam_size=args.beam_size,
-        max_length=args.max_length)
+        beam_size=beam_size,
+        max_length=max_length)
 
     # clone from default main program
     inference_program = fluid.default_main_program().clone()
 
     optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
 
     train_batch_generator = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=args.batch_size * args.gpus)
 
     test_batch_generator = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
         batch_size=args.batch_size)
 
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    def do_validation():
-        total_loss = 0.0
-        count = 0
-        for batch_id, data in enumerate(test_batch_generator()):
-            src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
-            trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
-            lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
-
-            fetch_outs = exe.run(inference_program,
-                                 feed={
-                                     feeding_list[0]: src_seq,
-                                     feeding_list[1]: trg_seq,
-                                     feeding_list[2]: lbl_seq
-                                 },
-                                 fetch_list=[avg_cost],
-                                 return_numpy=False)
-
-            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
-            count += 1
-
-        return total_loss / count
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in xrange(args.pass_num):
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_batch_generator()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            num_samples += word_num
-            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            num_samples += word_num
-            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
-
-            fetch_outs = exe.run(framework.default_main_program(),
-                                 feed={
-                                     feeding_list[0]: src_seq,
-                                     feeding_list[1]: trg_seq,
-                                     feeding_list[2]: lbl_seq
-                                 },
-                                 fetch_list=[avg_cost])
-
-            iters += 1
-            loss = np.array(fetch_outs[0])
-            print(
-                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            test_loss = do_validation()
-        exit(0)
-
-
-def infer():
-    pass
-
-
-def print_arguments(args):
-    print('----------- seq2seq Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    print_arguments(args)
-    if args.infer_only:
-        infer()
-    else:
-        train()
+    return avg_cost, inference_program, optimizer, train_batch_generator, \
+           test_batch_generator, None
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
new file mode 100644
index 0000000000..8e740dc689
--- /dev/null
+++ b/benchmark/fluid/models/mnist.py
@@ -0,0 +1,125 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import cProfile
+import os
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def get_model(args):
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1, 1, 28, 28], (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = cnn_model(pd.read_input(images))
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
new file mode 100644
index 0000000000..9ed1093c54
--- /dev/null
+++ b/benchmark/fluid/models/resnet.py
@@ -0,0 +1,208 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+import time
+import os
+
+import cProfile, pstats, StringIO
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+from recordio_converter import imagenet_train, imagenet_test
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    conv1 = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
+    if ch_in != ch_out:
+        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+    else:
+        return input
+
+
+def basicblock(input, ch_out, stride):
+    short = shortcut(input, ch_out, stride)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+    short = shortcut(input, ch_out * 4, stride)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+    return out
+
+
+def get_model(args):
+    model = resnet_cifar10
+    if args.data_set == "cifar10":
+        class_dim = 10
+        if args.data_format == 'NCHW':
+            dshape = [3, 32, 32]
+        else:
+            dshape = [32, 32, 3]
+        model = resnet_cifar10
+        train_reader = paddle.dataset.cifar.train10()
+        test_reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
+        class_dim = 102
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        train_reader = paddle.dataset.flowers.train()
+        test_reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        train_reader = imagenet_train(args.data_path)
+        test_reader = imagenet_test(args.data_path)
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + dshape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        input, label = fluid.layers.read_file(data_file)
+    else:
+        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = model(pd.read_input(input), class_dim)
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        predict = model(input, class_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc])
+
+    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+
+    batched_train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            train_reader, buf_size=5120),
+        batch_size=args.batch_size * args.gpus,
+        drop_last=True)
+    batched_test_reader = paddle.batch(
+        train_reader, batch_size=args.batch_size, drop_last=True)
+
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
+                   batched_test_reader, batch_acc
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
similarity index 52%
rename from benchmark/fluid/stacked_dynamic_lstm.py
rename to benchmark/fluid/models/stacked_dynamic_lstm.py
index 73bcc47b4d..211869af4e 100644
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -29,57 +29,6 @@ import paddle.fluid as fluid
 import paddle.batch as batch
 import paddle.fluid.profiler as profiler
 
-
-def parse_args():
-    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=32,
-        help='The sequence number of a batch data. (default: %(default)d)')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--emb_dim',
-        type=int,
-        default=512,
-        help='Dimension of embedding table. (default: %(default)d)')
-    parser.add_argument(
-        '--hidden_dim',
-        type=int,
-        default=512,
-        help='Hidden size of lstm unit. (default: %(default)d)')
-    parser.add_argument(
-        '--pass_num',
-        type=int,
-        default=100,
-        help='Epoch number to train. (default: %(default)d)')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='CPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--crop_size',
-        type=int,
-        default=int(os.environ.get('CROP_SIZE', '1500')),
-        help='The max sentence length of input. Since this model use plain RNN,'
-        ' Gradient could be explored if sentence is too long')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
 word_dict = imdb.word_dict()
 
 
@@ -94,14 +43,18 @@ def crop_sentence(reader, crop_size):
     return __impl__
 
 
-def main():
-    args = parse_args()
-    lstm_size = args.hidden_dim
+def get_model(args):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500
 
     data = fluid.layers.data(
         name="words", shape=[1], lod_level=1, dtype='int64')
     sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), args.emb_dim])
+        input=data, size=[len(word_dict), emb_dim])
 
     sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
 
@@ -161,51 +114,17 @@ def main():
             target_vars=[batch_acc, batch_size_tensor])
 
     adam = fluid.optimizer.Adam()
-    adam.minimize(loss)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
 
     train_reader = batch(
         paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), args.crop_size),
-            buf_size=25000),
+            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+        batch_size=args.batch_size * args.gpus)
+    test_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
         batch_size=args.batch_size)
 
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            tensor_words = to_lodtensor([x[0] for x in data], place)
-            label = numpy.array([x[1] for x in data]).astype("int64")
-            label = label.reshape((-1, 1))
-            loss_np, acc, weight = exe.run(
-                fluid.default_main_program(),
-                feed={"words": tensor_words,
-                      "label": label},
-                fetch_list=[loss, batch_acc, batch_size_tensor])
-            iters += 1
-            for x in data:
-                num_samples += len(x[0])
-            print(
-                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
-                (pass_id, iters, loss_np, acc)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        exit(0)
+    return loss, inference_program, adam, train_reader, test_reader, batch_acc
 
 
 def to_lodtensor(data, place):
@@ -221,16 +140,3 @@ def to_lodtensor(data, place):
     res.set(flattened_data, place)
     res.set_lod([lod])
     return res
-
-
-def print_arguments(args):
-    print('----------- lstm Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    main()
diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py
new file mode 100644
index 0000000000..932601302d
--- /dev/null
+++ b/benchmark/fluid/models/vgg.py
@@ -0,0 +1,121 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+import os
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+def get_model(args):
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + data_shape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(
+            name='data', shape=data_shape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py
new file mode 100644
index 0000000000..f2dc39109b
--- /dev/null
+++ b/benchmark/fluid/recordio_converter.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.dataset import mnist, cifar, flowers, image
+
+
+def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
+                       shape_label):
+    num_batches = 0
+    with fluid.program_guard(fluid.Program(), fluid.Program()):
+        reader = paddle.batch(py_reader(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(
+            feed_list=[  # order is image and label
+                fluid.layers.data(
+                    name='image', shape=shape_data),
+                fluid.layers.data(
+                    name='label', shape=shape_label, dtype='int64'),
+            ],
+            place=fluid.CPUPlace())
+        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+            outfilepath, reader, feeder)
+    return num_batches
+
+
+def prepare_mnist(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "mnist.recordio")
+    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
+
+
+def prepare_cifar10(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "cifar.recordio")
+    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
+
+
+def prepare_flowers(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "flowers.recordio")
+    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
+                       [1])
+
+
+def default_mapper(sample):
+    img, label = sample
+    img = image.simple_transform(
+        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
+    return img.flatten().astype('float32'), label
+
+
+def imagenet_train(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "train.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+    # shuffle all, this is slow
+    random.shuffle(imgfilelist)
+
+    def train_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            data = image.load_image(
+                os.path.join(data_dir, "train", imgfile.lower()))
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, train_reader)
+
+
+def imagenet_test(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "val.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+
+    def test_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
+            image_path = ".".join([base_path, "jpeg"])
+            data = image.load_image(image_path)
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, test_reader)
+
+
+# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with fluid.recordio_writer.create_recordio_writer(
+                    filename, compressor, max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+            print("written file: ", filename)
+    return counter
+
+
+def prepare_imagenet(inpath, outpath, batch_size):
+    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            fluid.layers.data(
+                name="image", shape=[3, 224, 224]), fluid.layers.data(
+                    name="label", shape=[1], dtype='int64')
+        ],
+        place=fluid.CPUPlace())
+    outpath = os.path.join(outpath, "imagenet.recordio")
+    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
deleted file mode 100644
index 0fd7258a80..0000000000
--- a/benchmark/fluid/resnet.py
+++ /dev/null
@@ -1,317 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import functools
-import numpy as np
-import time
-
-import cProfile, pstats, StringIO
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('Convolution model benchmark.')
-    parser.add_argument(
-        '--model',
-        type=str,
-        choices=['resnet_imagenet', 'resnet_cifar10'],
-        default='resnet_imagenet',
-        help='The model architecture.')
-    parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
-    parser.add_argument(
-        '--use_fake_data',
-        action='store_true',
-        help='use real data or fake data')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=100, help='The number of passes.')
-    parser.add_argument(
-        '--data_format',
-        type=str,
-        default='NCHW',
-        choices=['NCHW', 'NHWC'],
-        help='The data data_format, now only support NCHW.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--data_set',
-        type=str,
-        default='flowers',
-        choices=['cifar10', 'flowers'],
-        help='Optional dataset for benchmark.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
-    conv1 = fluid.layers.conv2d(
-        input=input,
-        filter_size=filter_size,
-        num_filters=ch_out,
-        stride=stride,
-        padding=padding,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
-    if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-    else:
-        return input
-
-
-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
-
-
-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
-
-
-def layer_warp(block_func, input, ch_out, count, stride):
-    res_out = block_func(input, ch_out, stride)
-    for i in range(1, count):
-        res_out = block_func(res_out, ch_out, 1)
-    return res_out
-
-
-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
-
-    cfg = {
-        18: ([2, 2, 2, 1], basicblock),
-        34: ([3, 4, 6, 3], basicblock),
-        50: ([3, 4, 6, 3], bottleneck),
-        101: ([3, 4, 23, 3], bottleneck),
-        152: ([3, 8, 36, 3], bottleneck)
-    }
-    stages, block_func = cfg[depth]
-    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
-    pool1 = fluid.layers.pool2d(
-        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
-    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
-    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
-    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
-    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
-    pool2 = fluid.layers.pool2d(
-        input=res4,
-        pool_size=7,
-        pool_type='avg',
-        pool_stride=1,
-        global_pooling=True)
-    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
-    return out
-
-
-def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
-    assert (depth - 2) % 6 == 0
-
-    n = (depth - 2) // 6
-
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
-    return out
-
-
-def run_benchmark(model, args):
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
-
-    if args.data_set == "cifar10":
-        class_dim = 10
-        if args.data_format == 'NCHW':
-            dshape = [3, 32, 32]
-        else:
-            dshape = [32, 32, 3]
-    else:
-        class_dim = 102
-        if args.data_format == 'NCHW':
-            dshape = [3, 224, 224]
-        else:
-            dshape = [224, 224, 3]
-
-    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    predict = model(input, class_dim)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    opts = optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
-
-    def test(exe):
-        test_accuracy = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(dshape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            acc, weight = exe.run(inference_program,
-                                  feed={"data": img_data,
-                                        "label": y_data},
-                                  fetch_list=[batch_acc, batch_size_tensor])
-            test_accuracy.add(value=acc, weight=weight)
-
-        return test_accuracy.eval()
-
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-    accuracy = fluid.average.WeightedAverage()
-    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
-    if args.use_fake_data:
-        data = train_reader().next()
-        image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
-            'float32')
-        label = np.array(map(lambda x: x[1], data)).astype('int64')
-        label = label.reshape([-1, 1])
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            if not args.use_fake_data:
-                image = np.array(map(lambda x: x[0].reshape(dshape),
-                                     data)).astype('float32')
-                label = np.array(map(lambda x: x[1], data)).astype('int64')
-                label = label.reshape([-1, 1])
-            loss, acc, weight = train_exe.run(
-                feed={'data': image,
-                      'label': label},
-                fetch_list=[
-                    avg_cost.name, batch_acc.name, batch_size_tensor.name
-                ])
-            iters += 1
-            num_samples += len(label)
-            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
-            loss = np.mean(np.array(loss))
-            acc = np.mean(np.array(acc))
-            train_losses.append(loss)
-            train_accs.append(acc)
-            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
-                  (pass_id, iters, loss, acc))
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        exit(0)
-
-
-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- resnet Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    model_map = {
-        'resnet_imagenet': resnet_imagenet,
-        'resnet_cifar10': resnet_cifar10
-    }
-    args = parse_args()
-    print_arguments(args)
-    if args.data_format == 'NHWC':
-        raise ValueError('Only support NCHW data_format now.')
-    if args.use_nvprof and args.device == 'GPU':
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-            run_benchmark(model_map[args.model], args)
-    else:
-        run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index f6dfd20bf2..5d9b2db871 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -2,6 +2,7 @@
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
 
+mkdir -p logs
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
 
@@ -35,71 +36,74 @@ nohup stdbuf -oL nvidia-smi \
       --format=csv \
       --filename=mem.log  \
       -l 1 &
+
 # mnist
 # mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=mnist \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
                --iterations=500 \
-               2>&1 | tee -a mnist_gpu_128.log
+               2>&1 | tee -a logs/mnist_gpu_128.log
 
 # vgg16
 # gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_128.log
+               2>&1 | tee -a logs/vgg16_gpu_128.log
 
 # flowers gpu  128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
                --device=GPU \
                --batch_size=32 \
                --data_set=flowers \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_flowers_32.log
+               2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
 
 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet \
                --device=GPU \
                --batch_size=128 \
                --data_set=cifar10 \
-               --model=resnet_cifar10 \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_128.log
+               2>&1 | tee -a logs/resnet50_gpu_128.log
 
 # resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet \
                --device=GPU \
                --batch_size=64 \
                --data_set=flowers \
-               --model=resnet_imagenet \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_flowers_64.log
+               2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
 
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=stacked_dynamic_lstm \
                --device=GPU \
                --batch_size=32 \
                --skip_batch_num=5 \
                --iterations=30 \
-               --hidden_dim=512 \
-               --emb_dim=512 \
-               --crop_size=1500 \
-               2>&1 | tee -a lstm_gpu_32.log
+               2>&1 | tee -a logs/lstm_gpu_32.log
 
 # seq2seq
 # seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=machine_translation \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a lstm_gpu_128.log
+               2>&1 | tee -a logs/lstm_gpu_128.log
diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh
new file mode 100644
index 0000000000..4309a3126c
--- /dev/null
+++ b/benchmark/fluid/run_fluid_benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
+
+sleep 15
+
+CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
+
+CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
deleted file mode 100644
index 2a9566a45c..0000000000
--- a/benchmark/fluid/vgg.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import argparse
-import functools
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--skip_batch_num',
-    type=int,
-    default=5,
-    help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
-    '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='GPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NCHW',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, now only support NCHW.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-parser.add_argument(
-    '--with_test',
-    action='store_true',
-    help='If set, test the testset during training.')
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
-    return fc2
-
-
-def main():
-    if args.data_set == "cifar10":
-        classdim = 10
-        if args.data_format == 'NCHW':
-            data_shape = [3, 32, 32]
-        else:
-            data_shape = [32, 32, 3]
-    else:
-        classdim = 102
-        if args.data_format == 'NCHW':
-            data_shape = [3, 224, 224]
-        else:
-            data_shape = [224, 224, 3]
-
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    opts = optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-
-    # Parameter initialization
-    exe.run(fluid.default_startup_program())
-
-    # data reader
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
-
-    # test
-    def test(exe):
-        test_accuracy = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            acc, weight = exe.run(inference_program,
-                                  feed={"pixel": img_data,
-                                        "label": y_data},
-                                  fetch_list=[batch_acc, batch_size_tensor])
-            test_accuracy.add(value=acc, weight=weight)
-        return test_accuracy.eval()
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    accuracy = fluid.average.WeightedAverage()
-    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            loss, acc, weight = train_exe.run(
-                feed={"pixel": img_data,
-                      "label": y_data},
-                fetch_list=[
-                    avg_cost.name, batch_acc.name, batch_size_tensor.name
-                ])
-            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
-            iters += 1
-            num_samples += len(y_data)
-            loss = np.mean(np.array(loss))
-            acc = np.mean(np.array(acc))
-            print(
-                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
-                (pass_id, iters, loss, acc)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        # pass_train_acc = accuracy.eval()
-        train_losses.append(loss)
-        train_accs.append(acc)
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        exit(0)
-
-
-def print_arguments():
-    print('----------- vgg Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == "__main__":
-    print_arguments()
-    main()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e490397cc0..6a8b15a6b6 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
     add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
 endif(USE_EIGEN_FOR_BLAS)
 
+if(EIGEN_USE_THREADS)
+    add_definitions(-DEIGEN_USE_THREADS)
+endif(EIGEN_USE_THREADS)
+
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
@@ -88,6 +92,9 @@ if(WITH_GPU)
         if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
             message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
         endif()
+        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+        endif()
         include_directories(${TENSORRT_INCLUDE_DIR})
     endif()
 elseif(WITH_AMD_GPU)
@@ -111,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
 
+if(WITH_DISTRIBUTE)
+  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
+
 if(WITH_GOLANG)
   # we need to symlink Paddle directory into GOPATH. If we
   # don't do it and we have code that depends on Paddle, go
@@ -159,3 +170,7 @@ if(WITH_GOLANG)
   endif()
 
 endif(WITH_GOLANG)
+
+if(WITH_GRPC)
+    add_definitions(-DPADDLE_WITH_GRPC)
+endif(WITH_GRPC)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
deleted file mode 100644
index 4823dc3e91..0000000000
--- a/cmake/cpplint.cmake
+++ /dev/null
@@ -1,62 +0,0 @@
-# util to check C++ file style
-# * it basically use google cpplint.py.
-# * It provide "add_style_check_target" for cmake.
-#   Usage see add_style_check_target's document
-#
-# TODO(yuyang18): Add python style check.
-
-set(STYLE_FILTER)
-
-# diable unwanted filters
-
-# paddle do not indent public/potected/private in class
-set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,")
-# paddle use mutable reference. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,")
-# paddle use relative path for include.
-set(STYLE_FILTER "${STYLE_FILTER}-build/include,")
-# paddle use <thread>, <mutex>, etc.
-set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,")
-# paddle use c style casting. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
-
-
-# IGNORE SOME FILES
-set(IGNORE_PATTERN
-    .*ImportanceSampler.*
-    .*cblas\\.h.*
-    .*\\.pb\\.txt
-    .*MultiDataProvider.*
-    .*pb.*
-    .*pybind.h)
-
-# add_style_check_target
-#
-# attach check code style step for target.
-#
-# first argument: target name to attach
-# rest arguments: source list to check code style.
-#
-# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
-macro(add_style_check_target TARGET_NAME)
-    if(WITH_STYLE_CHECK)
-        set(SOURCES_LIST ${ARGN})
-        list(REMOVE_DUPLICATES SOURCES_LIST)
-        foreach(filename ${SOURCES_LIST})
-            foreach(pattern ${IGNORE_PATTERN})
-                if(filename MATCHES ${pattern})
-                    list(REMOVE_ITEM SOURCES_LIST ${filename})
-                endif()
-            endforeach()
-        endforeach()
-
-        if(SOURCES_LIST)
-            add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-                COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
-                        "--filter=${STYLE_FILTER}"
-                        ${SOURCES_LIST}
-                COMMENT "cpplint: Checking source code style"
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})        
-        endif()
-    endif()
-endmacro()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 499682f644..73713d93d5 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -23,8 +23,12 @@ set(BOOST_PROJECT       "extern_boost")
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
-set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz")
+if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
+    message(STATUS "use pre defined download url")
+    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+endif()
+MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
new file mode 100644
index 0000000000..8e2c913b2c
--- /dev/null
+++ b/cmake/external/brpc.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
+SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
+SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
+SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE)
+
+INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
+
+# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf")
+
+# If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
+ExternalProject_Add(
+    extern_brpc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/brpc/brpc"
+    GIT_TAG         "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2"
+    PREFIX          ${BRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
+                    -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    -DCMAKE_PREFIX_PATH=${prefix_path}
+                    -DBRPC_WITH_GLOG=ON
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    LIST_SEPARATOR |
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy)
+ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
+ADD_DEPENDENCIES(brpc extern_brpc)
+
+
+LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index e90948782b..ffdf91a354 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -23,17 +23,29 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
 SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+
+include(ProcessorCount)
+ProcessorCount(NUM_OF_PROCESSOR)
+
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
 ENDIF()
 
+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
-    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.10.x"
+    # NOTE(wuyi):
+    # this package is generated by following steps:
+    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+    # 2. submodule update --init
+    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+    #    checkout and clean other dirs under third_party
+    # 4. remove .git, and package the directory.
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+    URL_MD5  "c9c58ee7d0e8929a63155af6a2ecdbd0"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
@@ -46,7 +58,6 @@ ExternalProject_Add(
     INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )
 
-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
 ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
              "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
new file mode 100644
index 0000000000..fb5091731d
--- /dev/null
+++ b/cmake/external/leveldb.cmake
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb)
+SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
+SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE)
+SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE)
+INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
+
+ExternalProject_Add(
+    extern_leveldb
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${LEVELDB_SOURCES_DIR}
+    URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
+    URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
+    INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ 
+        && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
+        && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/
+    BUILD_IN_SOURCE 1
+)
+
+ADD_DEPENDENCIES(extern_leveldb snappy)
+
+ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
+ADD_DEPENDENCIES(leveldb extern_leveldb)
+
+LIST(APPEND external_project_dependencies leveldb)
+
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index e9a37b52e6..82c424fb79 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,12 @@ ENDIF()
 INCLUDE(ExternalProject)
 
 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.3.20180406")
-SET(MKLML_URL           "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz")
+IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
+  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 0fde4373a4..2665996432 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
             ${OPTIONAL_ARGS}
             -Dprotobuf_BUILD_TESTS=OFF
+            -DCMAKE_SKIP_RPATH=ON
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 1d3e2ade6d..0e2df86c19 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -206,8 +206,6 @@ function(cc_library TARGET_NAME)
         list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
       endif()
     endforeach()
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
-
   else(cc_library_SRCS)
     if(cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -231,7 +229,7 @@ endfunction(cc_binary)
 
 function(cc_test TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -241,6 +239,9 @@ function(cc_test TARGET_NAME)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (${cc_test_SERIAL})
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction(cc_test)
 
@@ -268,7 +269,6 @@ function(nv_library TARGET_NAME)
           list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
-      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
     else(nv_library_SRCS)
       if (nv_library_DEPS)
         merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
@@ -295,7 +295,7 @@ endfunction(nv_binary)
 
 function(nv_test TARGET_NAME)
   if (WITH_GPU AND WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -303,6 +303,9 @@ function(nv_test TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
+    if (nv_test_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction(nv_test)
 
@@ -338,7 +341,6 @@ function(hip_library TARGET_NAME)
 	  list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
 	endif()
       endforeach()
-      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
     else(hip_library_SRCS)
       if (hip_library_DEPS)
 	merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
@@ -608,3 +610,21 @@ function(grpc_library TARGET_NAME)
     COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
 endfunction()
+
+
+function(brpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message(STATUS "generating brpc ${brpc_library_PROTO}")
+
+  get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+  protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
+  cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
+  cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
+endfunction()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 1144ca7f43..236a55d332 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -52,80 +52,91 @@ function(copy TARGET)
 endfunction()
 
 # third party
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
 copy(eigen3_lib
   SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
   DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+  DEPS eigen3
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
 copy(gflags_lib
   SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS gflags
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
 copy(glog_lib
   SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS glog
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
 copy(boost_lib
   SRCS ${BOOST_INCLUDE_DIR}/boost
   DSTS ${dst_dir}
+  DEPS boost
 )
 
 if(NOT PROTOBUF_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
       SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
       DSTS ${dst_dir} ${dst_dir}/lib
+      DEPS extern_protobuf
     )
 endif()
 
 if(NOT CBLAS_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
     copy(openblas_lib
       SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
       DSTS ${dst_dir} ${dst_dir}
+      DEPS extern_openblas
     )
 elseif (WITH_MKLML)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
     copy(mklml_lib
       SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
       DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+      DEPS mklml
     )
 endif()
 
 if(WITH_MKLDNN)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mkldnn")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
   copy(mkldnn_lib
     SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
     DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS mkldnn
   )
 endif()
 
 if(NOT MOBILE_INFERENCE AND NOT RPI)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
   copy(snappy_lib
     SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
-    DSTS ${dst_dir} ${dst_dir}/lib)
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS snappy)
 
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
   copy(snappystream_lib
     SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
-    DSTS ${dst_dir} ${dst_dir}/lib)
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS snappystream)
 
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
   copy(zlib_lib
     SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
-    DSTS ${dst_dir} ${dst_dir}/lib)
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS zlib)
 endif()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
+set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto 
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -162,4 +173,25 @@ copy(pybind_lib
   DSTS ${dst_dir}/${module}
 )
 
+# CMakeCache Info
+copy(cmake_cache
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INSTALL_DIR})
+
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+
+# paddle fluid version
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+set(version_file ${FLUID_INSTALL_DIR}/version.txt)
+file(WRITE ${version_file}
+  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+  "WITH_MKL: ${WITH_MKL}\n"
+  "WITH_GPU: ${WITH_GPU}\n")
+if(WITH_GPU)
+  file(APPEND ${version_file}
+    "CUDA version: ${CUDA_VERSION}\n"
+    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+endif()
diff --git a/contrib/inference/paddle_inference_api.h b/contrib/inference/paddle_inference_api.h
deleted file mode 100644
index dbaa7c95b9..0000000000
--- a/contrib/inference/paddle_inference_api.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-namespace paddle {
-
-class Predictor {
-public:
-  struct Attr;
-  Predictor() = default;
-
-  // Build the network before inference.
-  bool Init(const Attr& attr);
-
-  // Predict an record.
-  // Arguments:
-  //   inputs: the name of the input variables.
-  //   outputs: the name of the output varaibles.
-  //   input_shapes: the shape of the input variables.
-  //   output_shapes: the shape of the output variables.
-  //   input_data: the data of the input variables.
-  //   output_data: the data of the output variables.
-  bool Run(const std::vector<std::string>& inputs,
-           const std::vector<std::string>& outputs,
-           const std::vector<std::vector<int>>& input_shapes,
-           const std::vector<std::vector<int>>& output_shapes,
-           const std::vector<std::vector<float>>& input_data,
-           std::vector<std::vector<float>>* output_data);
-
-  // Clone a predictor that share the model weights.
-  Predictor* Clone();
-
-  // Destroy the Predictor.
-  ~Predictor();
-
-  struct Attr {
-    enum class EngineKind;
-
-    std::string model_dir;      // path to the model directory.
-    bool enable_engine{false};  // Enable to execute (part of) the model on
-                                // third-party engines.
-    EngineKind engine_kind{Attr::EngineKind::kNone};
-
-    enum class EngineKind {
-      kNone = -1,          // Use the native Fluid facility.
-      kAnakin,             // Use Anakin for inference.
-      kTensorRT,           // Use TensorRT for inference.
-      kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
-      kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-    };
-  };
-};
-
-}  // namespace paddle
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index 8086507bb4..be92af3902 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_fluid_docs
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -50,6 +51,4 @@ sphinx_add_target(paddle_fluid_docs_cn
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
 
-add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
-
 add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
index 48b396f078..435d6e10fb 100644
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst
new file mode 100644
index 0000000000..3ba096388f
--- /dev/null
+++ b/doc/fluid/api/clip.rst
@@ -0,0 +1,47 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+clip
+====
+
+ErrorClipByValue
+----------------
+
+..  autoclass:: paddle.fluid.clip.ErrorClipByValue
+    :members:
+    :noindex:
+
+GradientClipByValue
+-------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByValue
+    :members:
+    :noindex:
+
+GradientClipByNorm
+------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByNorm
+    :members:
+    :noindex:
+
+GradientClipByGlobalNorm
+------------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
+    :members:
+    :noindex:
+
+append_gradient_clip_ops
+------------------------
+
+..  autofunction:: paddle.fluid.clip.append_gradient_clip_ops
+    :noindex:
+
+error_clip_callback
+-------------------
+
+..  autofunction:: paddle.fluid.clip.error_clip_callback
+    :noindex:
+
diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst
index f80b87c7d2..c0dc9a0d1d 100644
--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
@@ -5,24 +5,3 @@
 evaluator
 =========
 
-ChunkEvaluator
---------------
-
-..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
-    :members:
-    :noindex:
-
-EditDistance
---------------
-
-..  autoclass:: paddle.fluid.evaluator.EditDistance
-    :members:
-    :noindex:
-
-DetectionMAP
---------------
-
-..  autoclass:: paddle.fluid.evaluator.DetectionMAP
-    :members:
-    :noindex:
-  
diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst
index a9cdf264e4..f67a14c49f 100644
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
@@ -30,3 +30,9 @@ switch_scope
 ..  autofunction:: paddle.fluid.executor.switch_scope
     :noindex:
 
+fetch_var
+---------
+
+..  autofunction:: paddle.fluid.executor.fetch_var
+    :noindex:
+
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index ba7b7ba8e5..0f05393555 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
 
-for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
   python gen_doc.py ${module} > ${module}.rst
 done
diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst
index 06c686d950..29cea9c682 100644
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
@@ -9,8 +9,9 @@ Fluid
     data_feeder.rst
     executor.rst
     initializer.rst
-    evaluator.rst
+    metrics.rst
     nets.rst
+    clip.rst
     optimizer.rst
     param_attr.rst
     profiler.rst
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index 2f02c5de09..c49a98c744 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,11 +33,16 @@ Xavier
     :members:
     :noindex:
 
-MSRA
-------
+force_init_on_cpu
+-----------------
 
-..  autoclass:: paddle.fluid.initializer.MSRA
-    :members:
+..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
+    :noindex:
+
+init_on_cpu
+-----------
+
+..  autofunction:: paddle.fluid.initializer.init_on_cpu
     :noindex:
 
 ConstantInitializer
@@ -68,9 +73,3 @@ XavierInitializer
     :members:
     :noindex:
 
-
-MSRAInitializer
------------------
-..  autoclass:: paddle.fluid.initializer.MSRAInitializer
-    :members:
-    :noindex:
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index dd9d88b669..3e956f8302 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,3 +59,21 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
     :noindex:
 
+save_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+load_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+clean_checkpoint
+----------------
+
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index ff3c9346a2..f78e6db326 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -55,6 +55,13 @@ While
     :members:
     :noindex:
 
+Switch
+------
+
+..  autoclass:: paddle.fluid.layers.Switch
+    :members:
+    :noindex:
+
 lod_rank_table
 --------------
 
@@ -67,12 +74,6 @@ max_sequence_len
 ..  autofunction:: paddle.fluid.layers.max_sequence_len
     :noindex:
 
-topk
-----
-
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
 lod_tensor_to_array
 -------------------
 
@@ -109,6 +110,12 @@ less_than
 ..  autofunction:: paddle.fluid.layers.less_than
     :noindex:
 
+equal
+-----
+
+..  autofunction:: paddle.fluid.layers.equal
+    :noindex:
+
 array_read
 ----------
 
@@ -174,6 +181,12 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
     :noindex:
 
+is_empty
+--------
+
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
 device
 ======
 
@@ -212,6 +225,55 @@ Send
 ..  autofunction:: paddle.fluid.layers.Send
     :noindex:
 
+open_recordio_file
+------------------
+
+..  autofunction:: paddle.fluid.layers.open_recordio_file
+    :noindex:
+
+open_files
+----------
+
+..  autofunction:: paddle.fluid.layers.open_files
+    :noindex:
+
+read_file
+---------
+
+..  autofunction:: paddle.fluid.layers.read_file
+    :noindex:
+
+shuffle
+-------
+
+..  autofunction:: paddle.fluid.layers.shuffle
+    :noindex:
+
+batch
+-----
+
+..  autofunction:: paddle.fluid.layers.batch
+    :noindex:
+
+double_buffer
+-------------
+
+..  autofunction:: paddle.fluid.layers.double_buffer
+    :noindex:
+
+random_data_generator
+---------------------
+
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+Preprocessor
+------------
+
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
 nn
 ==
 
@@ -281,12 +343,6 @@ square_error_cost
 ..  autofunction:: paddle.fluid.layers.square_error_cost
     :noindex:
 
-accuracy
---------
-
-..  autofunction:: paddle.fluid.layers.accuracy
-    :noindex:
-
 chunk_eval
 ----------
 
@@ -311,6 +367,18 @@ sequence_pool
 ..  autofunction:: paddle.fluid.layers.sequence_pool
     :noindex:
 
+sequence_softmax
+----------------
+
+..  autofunction:: paddle.fluid.layers.sequence_softmax
+    :noindex:
+
+softmax
+-------
+
+..  autofunction:: paddle.fluid.layers.softmax
+    :noindex:
+
 pool2d
 ------
 
@@ -323,12 +391,6 @@ batch_norm
 ..  autofunction:: paddle.fluid.layers.batch_norm
     :noindex:
 
-layer_norm
-----------
-
-..  autofunction:: paddle.fluid.layers.layer_norm
-    :noindex:
-
 beam_search_decode
 ------------------
 
@@ -377,6 +439,12 @@ reduce_min
 ..  autofunction:: paddle.fluid.layers.reduce_min
     :noindex:
 
+reduce_prod
+-----------
+
+..  autofunction:: paddle.fluid.layers.reduce_prod
+    :noindex:
+
 sequence_first_step
 -------------------
 
@@ -425,6 +493,12 @@ matmul
 ..  autofunction:: paddle.fluid.layers.matmul
     :noindex:
 
+topk
+----
+
+..  autofunction:: paddle.fluid.layers.topk
+    :noindex:
+
 warpctc
 -------
 
@@ -473,6 +547,60 @@ multiplex
 ..  autofunction:: paddle.fluid.layers.multiplex
     :noindex:
 
+layer_norm
+----------
+
+..  autofunction:: paddle.fluid.layers.layer_norm
+    :noindex:
+
+softmax_with_cross_entropy
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
+    :noindex:
+
+smooth_l1
+---------
+
+..  autofunction:: paddle.fluid.layers.smooth_l1
+    :noindex:
+
+one_hot
+-------
+
+..  autofunction:: paddle.fluid.layers.one_hot
+    :noindex:
+
+autoincreased_step_counter
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
+    :noindex:
+
+reshape
+-------
+
+..  autofunction:: paddle.fluid.layers.reshape
+    :noindex:
+
+lod_reset
+---------
+
+..  autofunction:: paddle.fluid.layers.lod_reset
+    :noindex:
+
+lrn
+---
+
+..  autofunction:: paddle.fluid.layers.lrn
+    :noindex:
+
+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
 label_smooth
 ------------
 
@@ -480,12 +608,35 @@ label_smooth
     :noindex:
 
 roi_pool
----------
+--------
 
 ..  autofunction:: paddle.fluid.layers.roi_pool
     :noindex:
 
-    
+dice_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+resize_bilinear
+---------------
+
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+gather
+------
+
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+random_crop
+-----------
+
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
 ops
 ===
 
@@ -501,18 +652,6 @@ mul
 ..  autofunction:: paddle.fluid.layers.mul
     :noindex:
 
-reshape
--------
-
-..  autofunction:: paddle.fluid.layers.reshape
-    :noindex:
-
-pad
----
-
-..  autofunction:: paddle.fluid.layers.pad
-    :noindex:
-
 scale
 -----
 
@@ -579,10 +718,76 @@ clip_by_norm
 ..  autofunction:: paddle.fluid.layers.clip_by_norm
     :noindex:
 
-sequence_softmax
-----------------
+logical_and
+-----------
 
-..  autofunction:: paddle.fluid.layers.sequence_softmax
+..  autofunction:: paddle.fluid.layers.logical_and
+    :noindex:
+
+logical_or
+----------
+
+..  autofunction:: paddle.fluid.layers.logical_or
+    :noindex:
+
+logical_xor
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_xor
+    :noindex:
+
+logical_not
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_not
+    :noindex:
+
+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+uniform_random_batch_size_like
+------------------------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
+    :noindex:
+
+gaussian_random
+---------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random
+    :noindex:
+
+gaussian_random_batch_size_like
+-------------------------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
+    :noindex:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+scatter
+-------
+
+..  autofunction:: paddle.fluid.layers.scatter
+    :noindex:
+
+sum
+---
+
+..  autofunction:: paddle.fluid.layers.sum
+    :noindex:
+
+shape
+-----
+
+..  autofunction:: paddle.fluid.layers.shape
     :noindex:
 
 sigmoid
@@ -651,6 +856,18 @@ floor
 ..  autofunction:: paddle.fluid.layers.floor
     :noindex:
 
+cos
+---
+
+..  autofunction:: paddle.fluid.layers.cos
+    :noindex:
+
+sin
+---
+
+..  autofunction:: paddle.fluid.layers.sin
+    :noindex:
+
 round
 -----
 
@@ -822,10 +1039,3 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
     :noindex:
 
-topk
-----
-
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
-
diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst
new file mode 100644
index 0000000000..ddf07775d7
--- /dev/null
+++ b/doc/fluid/api/metrics.rst
@@ -0,0 +1,56 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=======
+metrics
+=======
+
+MetricBase
+----------
+
+..  autoclass:: paddle.fluid.metrics.MetricBase
+    :members:
+    :noindex:
+
+CompositeMetric
+---------------
+
+..  autoclass:: paddle.fluid.metrics.CompositeMetric
+    :members:
+    :noindex:
+
+Accuracy
+--------
+
+..  autoclass:: paddle.fluid.metrics.Accuracy
+    :members:
+    :noindex:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.fluid.metrics.ChunkEvaluator
+    :members:
+    :noindex:
+
+EditDistance
+------------
+
+..  autoclass:: paddle.fluid.metrics.EditDistance
+    :members:
+    :noindex:
+
+DetectionMAP
+------------
+
+..  autoclass:: paddle.fluid.metrics.DetectionMAP
+    :members:
+    :noindex:
+
+Auc
+---
+
+..  autoclass:: paddle.fluid.metrics.Auc
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index 7a92caf9b7..6ad44bb690 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,9 +89,31 @@ DecayedAdagradOptimizer
     :members:
     :noindex:
 
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
 Adadelta
---------------
+--------
 
-..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
+..  autoclass:: paddle.fluid.optimizer.Adadelta
     :members:
     :noindex:
+
+ModelAverage
+------------
+
+..  autoclass:: paddle.fluid.optimizer.ModelAverage
+    :members:
+    :noindex:
+
+Optimizer
+---------
+
+..  autoclass:: paddle.fluid.optimizer.Optimizer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
index 74d102dcb0..39fda65863 100644
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,3 +23,15 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
     :noindex:
 
+start_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+stop_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
+
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
index 837c67111c..756bc53baa 100644
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
@@ -11,6 +11,13 @@ append_regularization_ops
 ..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
     :noindex:
 
+WeightDecayRegularizer
+----------------------
+
+..  autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
+    :members:
+    :noindex:
+
 L1Decay
 -------
 
@@ -26,15 +33,16 @@ L2Decay
     :noindex:
 
 L1DecayRegularizer
----------------------
+------------------
 
 ..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
     :members:
     :noindex:
 
 L2DecayRegularizer
----------------------
+------------------
 
 ..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
     :members:
     :noindex:
+
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index 6750323c01..8db67f6703 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -35,7 +35,7 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist
 
 ## Definition of VarType
 
-A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
+A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
 
 ```proto
 message VarDesc {
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
index b50f18f21d..7d39b8de1e 100644
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -1,8 +1,9 @@
 # API注释撰写标准
 
-- [API注释模块](#API注释模块)
-- [格式及示例](#格式及示例)
-- [完整示例](#完整示例)
+- [API注释撰写标准](#api)
+    - [API注释模块](#api)
+    - [格式及示例](#)
+    - [完整示例](#)
 
 
 ## API注释模块
@@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
 
 ## 完整示例
 
-fc 的完整注释见[示例](src/fc.py)。
+fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
index e57072d52f..f175b21975 100644
--- a/doc/fluid/dev/api_doc_std_en.md
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -1,8 +1,9 @@
 # API Doc Standard
 
-- [API Doc Structure](#API Doc Structure)
-- [Format and Examples](#Format and Examples)
-- [Complete Example](#Complete Example)
+- [API Doc Standard](#api-doc-standard)
+    - [API Doc Structure](#api-doc-structure)
+    - [Format and Examples](#format-and-examples)
+    - [Complete Example](#complete-example)
 
 
 ## API Doc Structure
@@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f
 
 ## Complete Example
 
-Complete Example of fc please see [here](src/fc.py)。
+Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
new file mode 100644
index 0000000000..79df6c5957
--- /dev/null
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -0,0 +1,1819 @@
+
+# Paddle Fluid 开发者指南
+
+---
+
+### ==1==. 为什么需要 PaddlePaddle Fluid？
+
+---
+
+### 两个基础问题
+
+<font size=6>
+
+1. 如何描述机器学习模型和优化过程？
+    - 完备自洽，表达能力足以支持潜在出现的各种计算需求
+1. 如何充分利用资源高效计算？
+    - 支持异步设备、多卡、分布式计算
+    - 降低计算/计算优化的开发成本
+    - ……
+
+</font>
+
+---
+
+### 如何描述模型和优化过程？
+
+<font size=6>
+
+<table>
+<thead>
+<tr>
+<th> </th>
+<th>一组连续执行的layers</th>
+<th>variable和operator构成的计算图 </th>
+<th>不再有模型的概念 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> 2013</td>
+<td> Caffe，Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> 2015 </td>
+<td> </td>
+<td> TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td> </td>
+<td> PyTorch, TensorFlow Eager Execution, <font color=#483D8B>**==PaddlePaddle Fluid==** </td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+
+### <p align="center">目标 </p>
+
+<font size=6>
+
+- 提高对各类机器学习任务的描述能力：能够描述潜在出现的任意机器学习模型。
+- 代码结构逻辑清晰，各模块充分解耦：内外部贡献者能够专注于自己所需的功能模块，基于框架进行再次开发。
+- 从设计上，留下技术优化的空间和潜力。
+- 代码解耦后降低多设备支持、计算优化等的开发成本。
+- 在统一的设计理念下，实现自动可伸缩，自动容错的分布式计算。
+
+</font>
+
+---
+
+## ==2.== Design Overview
+
+---
+
+# Fluid: 系统形态
+
+- <span style="background-color:#ACD6FF;">[编译器式的执行流程，区分编译时和运行时](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)</span>
+<br>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid-compiler.png" width=100%>
+</p>
+
+---
+
+#### 让我们在Fluid程序实例中，区分编译时和运行时
+
+---
+### Fluid 编译时
+
+<font size=5>
+
+- ==**定义前向计算**==
+
+  ```python
+  x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+  y_predict = fluid.layers.fc(input=x, size=1, act=None)
+  y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+  cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+  avg_cost = fluid.layers.mean(x=cost)
+  ```
+
+- ==**添加反向、正则、优化**==
+  ```python
+  learning_rate = 0.01
+  sgd_optimizer = fluid.optimizer.SGD(learning_rate)
+  sgd_optimizer.minimize(avg_cost)
+  ```
+</font>
+
+---
+
+### `Program` vs. 计算图
+
+<font size=5>
+
+- 在科学计算领域，计算图是一种描述计算的经典方式。下图展示了从前向计算图（蓝色）开始，通过添加反向（红色）和优化算法相关（绿色）操作，构建出整个计算图的过程：
+-
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%>
+</p>
+
+
+- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成，相关概念会在后文详细展开。
+- 编译时 Fluid 接受前向计算（这里可以先简单的理解为是一段有序的计算流）`Program`，为这段前向计算按照：前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序，添加相关 `Operator`和`Variable`到`Program`到完整的计算。
+
+</font>
+
+---
+
+### Fluid 运行时
+
+<font size=5>
+
+- ==**读入数据**==
+
+  ```python
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+      batch_size=20)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  ```
+- ==**定义执行程序的设备**==
+  ```python
+  place = fluid.CPUPlace()
+  feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
+  ```
+
+- ==创建执行器（Executor），执行初始化 `Program`和训练`Program`==
+
+  ```python
+  exe = fluid.Executor(place)
+  exe.run(fluid.default_startup_program())
+  PASS_NUM = 100
+  for pass_id in range(PASS_NUM):
+      for data in train_reader():
+          avg_loss_value, = exe.run(fluid.default_main_program(),
+                                    feed=feeder.feed(data),
+                                    fetch_list=[avg_cost])
+          print(avg_loss_value)
+  ```
+</font>
+
+---
+
+### 总结：框架做什么？用户做什么？
+<br>
+
+<font size=5>
+<table>
+<thead>
+<tr>
+<th>构建训练</th>
+<th>执行训练</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<span style="background-color:#B3D9D9">用户</span>：描述前向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加反向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加优化运算<br><span style="background-color:#DAB1D5;">框架</span>：添加内存优化<br><span style="background-color:#DAB1D5;">框架</span>：添加并行/多设备/分布式相关的计算单元
+</td>
+
+<td>
+<span style="background-color:#DAB1D5;">框架</span>：创建Operator（计算）+ Variable（数据）<br><span style="background-color:#DAB1D5;">框架</span>：创建`Block`<br><span style="background-color:#DAB1D5;">框架</span>：内存管理/设备管理<br><span style="background-color:#DAB1D5;">框架</span>：执行计算
+</td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### <p align="center">总结：编译时</p>
+<font size=5>
+
+<span style="background-color:#A3D1D1;">**用户编写一段Python程序，描述模型的前向计算**</span>
+1. 创建变量描述 `VarDesc`
+1. 创建operators的描述 `OpDesc`
+1. 创建operators的属性
+1. 推断变量的类型和形状，进行静态检查：`inferShape`
+1. 规划变量的内存复用
+1. 创建反向计算
+1. 添加优化相关的Operators
+1. （可选）添加多卡/多机相关的Operator，生成在多卡/多机上运行的程序
+
+</font>
+
+---
+
+### <p align="center">总结：运行时</p>
+<font size=5>
+
+<span style="background-color:#C7C7E2;">**执行规划好的计算**</span>
+1. 创建`Executor`
+1. 为将要执行的一段计算，在层级式的`Scope`空间中创建`Scope`
+1. 创建`Block`，依次执行`Block`
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compile_run_time.png" width=50%><br>
+<font size=3> Figure. 编译时运行时概览</font>
+</p>
+
+</font>
+
+---
+<!-- *template: invert -->
+## ==3==. 用户如何描述计算？
+---
+
+### Fluid：==像写程序一样==定义计算
+<font size=5>
+
+- 顺序执行
+    ```python
+    x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    ```
+
+- 条件分支: [swith](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)、[ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
+
+   ```python
+   a = fluid.Var(10)
+   b = fluid.Var(0)
+
+   switch = fluid.switch()
+   with switch.block():
+      with switch.case(fluid.less_equal(a, 10)):
+          fluid.print("Case 1")
+      with switch.case(fluid.larger(a, 0)):
+          fluid.print("Case 2")
+      with switch.default():
+          fluid.print("Case 3")
+   ```
+
+>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
+
+</font>
+
+---
+
+### Fluid: ==像写程序一样==定义计算
+
+<font size=5>
+
+- 循环：[while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+  ```python
+  d0 = layers.data("d0", shape=[10], dtype='float32')
+  data_array = layers.array_write(x=d0, i=i)
+  array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
+
+  cond = layers.less_than(x=i, y=array_len)
+  while_op = layers.While(cond=cond)
+  with while_op.block():
+      d = layers.array_read(array=data_array, i=i)
+      i = layers.increment(x=i, in_place=True)
+      layers.array_write(result, i=i, array=d)
+      layers.less_than(x=i, y=array_len, cond=cond)
+  ```
+
+- 完整实例请点查看 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
+- beam search  [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+</font>
+
+---
+
+#### <p align="center">总结</p>
+
+<font size=5>
+
+1. 用户层提供的描述语法具有完备性、自洽性，有能力支持对复杂计算过程描述
+1. 使用方式和核心概念可以类比编程语言，认知能够直接迁移
+1. 能够支持：定义问题，逐步求解
+
+</font>
+
+---
+
+## ==3.== 核心概念
+
+---
+### 编译时概念 ：==变量和计算的描述==
+
+<font size=5>
+
+- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
+    - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
+
+- <span style="background-color:#DAB1D5;">什么是 Fluid Program</span>
+
+  - 在Fluid中，一个神经网络任务（训练/预测）被描述为一段`Program`
+  - `Program`包含对`Variable`（数据）和 `Operator`（对数据的操作）的描述
+  - `Variable` 和 `Operator` 被组织为多个可以嵌套的`Block`，构成一段完整的`Fluid Program`
+
+
+>编译阶段最终，经过 Transpiler 的执行规划，变换处理，生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行
+
+</font>
+
+---
+
+### 编译时概念 ：==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
+<font size=5>
+
+1. 接受一段`ProgramDesc`作为输入，生成一段新的`ProgramDesc`
+
+    - *Memory optimization transpiler*：向原始`ProgramDesc` 中插入 `FreeMemoryOps`，在一次迭代优化结束前提前释放内存，使得能够维持较小的 memory footprint
+
+    - *Distributed training transpiler*：将原始的`ProgramDesc`中转化为对应的分布式版本，生成两段新的`ProgramDesc`:
+        1. trainer进程执行的`ProgramDesc`
+        1. parameter server执行的`ProgramDesc`
+
+1. ==**WIP**==: 接受一段`ProgramDesc`，生成可直接被`gcc`, `nvcc`, `icc`等编译的代码，编译后得到可执行文件
+
+</font>
+
+---
+### Transplier
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/transpiler.png" width=70%>
+</p>
+
+---
+
+### 打印 `ProgramDesc`
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/print_fluid_program.png" width=70%>
+</p>
+
+<font size=5>
+
+- `default_startup_program`：创建可学习参数，对参数进行初始化
+- `default_main_program`：由用户定义的模型，包括了前向、反向、优化及所有必要的计算
+
+- 打印可读的 `Program`
+  ```python
+  from paddle.v2.fluid import debuger
+  print debuger.pprint_program_codes(framework.default_main_program().desc)
+  ```
+</font>
+
+---
+### 输出效果
+
+<font size=5>
+
+<table>
+<thead>
+<th>variable in block 0</th>
+<th>variable in block 0</th>
+</thead>
+<tbody>
+<tr>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc1.png" width=70%></td>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc2.png" width=70%></td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### 运行时概念
+
+<font size=5>
+
+- 数据相关
+  - `Tensor` / `LoDTensor` / `Variable`
+  - `Scope`
+
+- 计算相关
+  - `Block`
+  - `Kernel`、`OpWithKernel`、`OpWithoutKernel`
+
+<table>
+<thead>
+<th></th>
+<th>protobuf messages</th>
+<th>C++ class objects</th>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107)
+</td>
+<td>[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24)
+</td>
+</tr>
+
+<tr>
+<td>Operation</td>
+<td>[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35)
+</td>
+<td>[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64)
+</td>
+</tr>
+<tr>
+<td>Block</td>
+<td>BlockDesc
+</td>
+<td>Block
+</td>
+</tr>
+
+
+</tbody>
+</table>
+
+- 执行相关 ：`Executor`
+
+</font>
+
+---
+#### Tensor 和 LoD(Level-of-Detail) Tensor
+<font size=5>
+
+- Tensor 是$n$-dimensional arry的推广，LoDTensor是在Tensor基础上附加了序列信息
+- Fluid中输入、输出，网络中的可学习参数全部统一使用LoDTensor（n-dimension array）表示
+- 一个mini-batch输入数据是一个LoDTensor
+  - 在Fluid中，RNN 处理变长序列无需padding，得益于 `LoDTensor`表示
+  - 可以简单将 LoD 理解为：`std::vector<std::vector<int>>`
+  - 对非序列数据，LoD 信息为空
+
+<table>
+<thead>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</thead>
+<tbody>
+<tr>
+<td>RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+
+<tr>
+<td>recursive RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+<tr>
+<td>padding zeros</td>
+<td>Must
+</td>
+<td>No need
+</td>
+<tr>
+<td>blob data type</td>
+<td>Tensor
+</td>
+<td>LODTensor
+</td>
+
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+#### LoD 信息实例
+
+<font size=4>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LoDTensor.png" width=43%>
+</p>
+
+- 图(a)的LoD 信息
+  ```cpp
+  [0, 5, 8, 10, 14]
+  ```
+- 图(b)的 LoD 信息
+  ```cpp
+  [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
+  ```
+</font>
+
+---
+#### Tensor, Variable, Scope 之间的关系
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/scope_variable_tensor.png" width=40%>
+</p>
+<font size=5>
+
+1. `Block` 是一个实现层的概念，不在应用层暴露给用户。目前用户无法自行创建并利用`Block`，用户能够感知的只有`Program`这个概念。
+1. 逻辑上，可以将 `Block` 类比为编程语言中的大括号：定义了一段作用域，其中运行一段代码
+1. `Executor`会为每一个`Block`创建一个`Scope`，`Block`是可嵌套的，因此`Scope`也是可嵌套的
+
+</font>
+
+---
+### Executor
+
+<font size=5>
+
+<table>
+<thead>
+<th>接口</th>
+<th>说明</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/executor.png" width=60%>
+</p></td>
+<td><span style="background-color:#B3D9D9;">输入</span><br>1. `ProgramDesc`<br>2. `Scope`<br> 3.`block_id`<br><br><span style="background-color:#B3D9D9;">解释执行步骤</span><br>1. 创建所有 Variables<br> 2. 逐一创建 Operator 并运行
+</td>
+</tr>
+</tbody>
+</table>
+
+---
+### Operator/OpWithKernel/Kernel
+<font size=5>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator1.png" width=50%>
+</p>
+
+- operator 无状态，Operator的核心是==Run==方法
+- 一个operator可以注册多个kernel
+- operator 可以无 kernel：while_op 、ifelse op
+
+</font>
+
+---
+#### Fluid Operator vs. PaddlePaddle layers
+<font size=5>
+
+<table>
+<thead>
+<th>Layer</th>
+<th>Operator</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/layer.png" width=70%>
+</p></td>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator2.png" width=73%>
+</p></td>
+</tr>
+
+<tr>
+<td>1. 内部维护状态<br>2. 包含forward和backward方法</td>
+<td>1. 内部无状态<br>2. 只有Run方法</td>
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+
+### ==4.== 内存管理
+
+---
+### 目标
+
+- 为异构设备提供统一的内存分配、回收接口
+- 最小化管理内存所需的时间，最小化管理开销
+- 减少内存碎片
+- 将内存管理与计算（Operators/Kernels）完全剥离
+- 统一内存管理是内存优化的基础
+
+---
+
+<font size=5>
+
+### Memory 接口
+
+- 内存管理模块向上层应用逻辑提供三个基础接口：
+  ```cpp
+  template <typename Place>
+  void* Alloc(Place place, size_t size);
+
+  template <typename Place>
+  void Free(Place place, void* ptr);
+
+  template <typename Place>
+  size_t Used(Place place);
+
+  struct Usage : public boost::static_visitor<size_t> {
+    size_t operator()(const platform::CPUPlace& cpu) const;
+    size_t operator()(const platform::CUDAPlace& gpu) const;
+  };
+  ```
+- 模板参数 `Place` 指示内存分配发生的设备
+- 实现时，需特化支持的 `Place`， 提供以上三个接口的实现
+
+</font>
+
+---
+### 代码结构
+
+<font size=5>
+
+内存管理模块可以理解为由以下两部分构成：
+
+1. SystemAllocator：实际从物理设备上分配、释放的内存的接口
+1. BuddyAllocator：内存管理算法
+
+</font>
+
+---
+### System Allocator
+
+<font size=5>
+
+- SystemAllocator 是实现物理内存分配、回收的基类
+    - 不同设备上的内存分配和回收终将转化为标准接口调用
+    - 为不同设备实现MemoryAllocator，继承自SystemAllocator
+
+  ```cpp
+  class SystemAllocator {
+   public:
+    virtual ~SystemAllocator() {}
+    virtual void* Alloc(size_t& index, size_t size) = 0;
+    virtual void Free(void* p, size_t size, size_t index) = 0;
+    virtual bool UseGpu() const = 0;
+  };
+  ```
+</font>
+
+---
+
+### CPU/GPU Allocator
+
+<font size=5>
+
+```cpp
+class CPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
+};
+#endif
+```
+- CPUAllocator和GPUAllocator分别继承自SystemAllocator，分别调用相应的标准库函数实现物理内存的分配和释放。
+- 一旦大块、连续的物理内存分配之后，将通过内存管理算法实现内存的按块分配、回收、重用等。
+
+</font>
+
+---
+### CPU Allocator
+
+<font size=5>
+
+- CPU 内存的分配提供两种选项：
+    1. non-pinned memory：可分页内存
+    2. pinned memory：页锁定内存
+        - 分配过大的页锁定内存有可能因为系统可使用的分页内存减少，影响系统性能，默认CPU下分配的是可分页内存
+
+- 通过gflags进行设置一次性分配内存的大小以及是否使用页锁定内存。
+
+   ```cpp
+   DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+   DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+                 "Default use 100% of CPU memory for PaddlePaddle,"
+                 "reserve the rest for page tables, etc");
+   ```
+
+</font>
+
+---
+### GPU Allocator
+
+<font size=5>
+
+- 通过 cudaMalloc 分配GPU显存
+- GPUAllocator::Alloc 首先会计算指定GPU device上的可用显存
+    - 如果可用显存小于请求分配大小，调用cudaMalloc进行分配
+    - 如果可用显存不足，目前会报错退出。
+- 通过gflags控制GPU下一次性分配显存的大小：
+
+  ```cpp
+  DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+                "Default use 92% of GPU memory for PaddlePaddle,"
+                "reserve the rest for page tables, etc");
+  ```
+
+</font>
+
+---
+#### 内存管理算法:  [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
+
+<font size=5>
+
+- Memory Arena：一次性分配大块连续内存，之后会基于这块内存进行内存管理：动态分配、释放、重用内存块。
+- 伙伴内存分配：
+    - 将内存划分为 2 的幂次方个分区，使用 best-fit 方法来分配内存请求。
+    - 当释放内存时，检查 buddy 块，查看相邻的内存块是否也已被释放。如果是，将内存块合并，以最小化内存碎片。
+    - 分配的内存在物理内存的自然边界对齐，提高内存访问效率。
+    - 算法的时间效率高，单使用 best-fit 方法的缘故，会产生一定的内存浪费
+
+</font>
+
+---
+
+### Buddy Allocator
+
+<font size=5>
+
+- BuddyAllocator 是一个单例，每个设备（如： GPU/CPU(0)/GPU(1)） 拥有一个BuddyAllocator
+- BuddyAllocator 内部拥有一个私有成员变量 SystemAllocator
+- 当请求的内存超过BuddyAllocator管理的空余内存时，将会调用SystemAllocator去指定的设备上分配物理内存
+
+</font>
+
+---
+### 实例：CPU 下内存管理接口的实现
+
+<font size=5>
+
+- 对上层应用，统一通过BuddyAllocator来实现内存的分配、释放以及用量查询
+    ```cpp
+    template <>
+    void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+      VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+      void* p = GetCPUBuddyAllocator()->Alloc(size);
+      VLOG(10) << "  pointer=" << p;
+      return p;
+    }
+
+    template <>
+    void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+      VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+      GetCPUBuddyAllocator()->Free(p);
+    }
+
+    template <>
+    size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+      return GetCPUBuddyAllocator()->Used();
+    }
+    ```
+</font>
+
+---
+### ==5.== 多设备支持
+
+---
+### 多设备支持（一）
+
+<font size=5>
+
+- step 1：添加Place类型，<span style="background-color:#DAB1D5;">由用户实现添加到框架</span>
+   - 可以将Place类型理解为一个整数加上一个枚举型，包括：设备号 + 设备类型
+
+    <p align="center">
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%>
+    </p>
+- DeviceContext
+    - 不同的Place会对应一个相应的DeviceContext，用于组织管理与设备相关的信息
+      - 例如，GpuDeviceContext中会管理Cuda stream
+    - 目前实现中一些特殊的库也会对应有自己的DeviceContext：例如：
+      ```cpp
+      class MKLDNNDeviceContext : public CPUDeviceContext {……}
+      ```
+    - 每种设备对应的DeviceContext需要管理的内容不尽相同，视具体需求来实现
+
+</font>
+
+---
+
+### 多设备支持（二）
+
+<font size=5>
+
+- step 2: 增加KernelType，为相应的KernelType注册Kernel对象，<span style="background-color:#DAB1D5;">由用户实现注册给框架</span> 可以按照：
+    1. Place 执行设备
+    1. DataType 执行数据类型 FP32/FP64/INT32/INT64
+    1. Memory layout： 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC
+    1. 使用的库
+
+    来区分Kernel，为同一个operator注册多个 Kernel。
+
+    ```cpp
+    struct OpKernelType {
+      proto::DataType data_type_;
+      DataLayout data_layout_;
+      platform::Place place_;
+      LibraryType library_type_;
+    }
+    ```
+
+</font>
+
+---
+
+### 多设备支持（三）
+
+<font size=5>
+
+step 3: 运行时的 KernelType 推断和Kernel切换，<span style="background-color:#DAB1D5;">按需要修改Kernel推断和Kernel切换规则</span>
+- Expected Kernel：期待调用的Kernel：由（1）`Place`和计算精度决定；或（2）用户在配置中显示指定使用的计算库，如`cudnn`、`mkldnn`等。
+- Actual Kernel：运行时从`Operator`的输入（`Variable`）可以推断出实际需要的`KernelType`
+- 当Expected Kernel和Actual Kernel不一致的时候，框架会插入`data_transformer`或者`data_layerout_transform`等，保证Expected Kernel可以执行，包括：
+   - CPUPlace -> GPUPlace ：跨设备内存复制
+   - NCHW -> nChw8c ：Layout转换
+   - FP32 -> FP16 ：精度转换 _**尚未支持**_
+   - ……
+- 以上过程实现在OperatorWithKernel类的Run方法中 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
+
+</font>
+
+---
+## ==6.== while_op
+
+---
+### while_op
+
+<font size=5>
+
+- 循环执行一段`Program`，直到条件operator判断循环条件不满足时终止循环
+- while_op 的特殊之处：
+  1. while_op 没有 kernel
+  1. while_op 拥有自己的`Block`，会形成一段嵌套的`Block`
+  1. ==while_op 内部创建了一个 Executor，来循环执行`Block`==
+
+- while_op 输入输出 ： LoDTensorArray
+    ```cpp
+    namespace paddle {
+    namespace framework {
+    using LoDTensorArray = std::vector<LoDTensor>;
+    }
+    }
+    ```
+    - 每一次循环，从原始输入中“切出”一个片段
+    - LoDTensorArray 在Python端暴露，是Fluid支持的基础数据结构之一，用户可以直接创建并使用
+
+</font>
+
+---
+### while_op [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) 方法概览
+
+<font size=5>
+
+```cpp
+
+void Run(const framework::Scope &scope,
+         const platform::Place &dev_place) const override {
+  PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+  auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+  PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+  framework::Executor executor(dev_place);
+  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+  auto *program = block->Program();
+  auto step_scopes =
+      scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
+  while (cond.data<bool>()[0]) {
+    auto &current_scope = scope.NewScope();
+    step_scopes->push_back(&current_scope);
+    executor.Run(*program, &current_scope, block->ID(),
+                   false /*create_local_scope*/);
+  }
+}
+
+```
+
+</font>
+
+---
+### while_op 的重要应用：Dynamic RNN
+
+---
+
+### 什么是 `dynamicRNN` ?
+
+<font size=5>
+<br>
+
+1. 用户可以自定义在一个时间步之内的计算, 框架接受序列输入数据，在其上循环调用用户定义的单步计算
+1. 可学习参数在多个时间步之间共享
+1. `dynamicRNN` 由 `while_op` 实现
+1. 如果`dynamicRNN`中定义了`memory`，将会构成一个循环神经网络，否则其行为就等于在输入序列上循环调用预定义的单步计算
+
+</font>
+
+---
+
+#### `dynamic RNN` 用户接口
+<font size=5>
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/user_interface.png" width=75%>
+</p>
+
+- `dynamicRNN` 中的重要元素
+  1. **step input**: `dynamicRNN` 每个时间步的输入
+  1. **step function**: 用户定义的单步计算
+  1. **memory**: 用于形成循环连接
+  1. **external/static memory**：单步计算的每一步都可以全部读取到的外部输入
+
+</font>
+
+---
+
+#### dynamicRNN 中的 Memory
+
+<font size=5>
+
+`dynamicRNN`中`memory`的行为非常类似于 C++ 中的引用变量
+  - `memory` “指向” 一个operator的输出变量，记作： A
+  - `memory` 可以被 LoDTensor 初始化（当LoD信息为空时，为非序列，否则为序列）,默认`memory`被初始化为零
+  - `memory` 在 operator A 前向计算之后，进行前向计算
+  - 当 `memory` 的前向计算会 "指向" A 的输出 LoDTensor
+  - `memory` 的输出可以是另一个 operator 的输入，于是形成了“循环”连接
+
+</font>
+
+---
+
+### DynamicRNN 实现细节
+
+<font size=5>
+
+- `while_op` <span style="background-color:#DAB1D5;">无法独立构成dynamicRNN</span>，必须和一组相关的 operator 及数据结构配合
+    - 依赖的 operators (这里仅列出最重要的，并非全部):
+        - `lod_rank_table` operator
+        - `lod_tensor_to_array` operator
+        - `array_to_lod_tensor` operator
+        - `shrink_memory` operator
+    - 依赖的数据结构
+        - `TensorArray`
+        - `LoDRankTable`
+
+- 在Fluid中，RNN接受变长序列输入，无需填充，以上数据结构和相关的operator配合工作，实现了对变长输入以batch计算
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+- 问题：
+  - RNN 可以看作是一个展开的前向网络，前向网络的深度是最长序列的长度
+  - 如果不对变长序列进行填充，将它们填充到一样长度，每个mini-batch输入将会不等长，每个样本展开长度不一致，导致前向和反向计算实现困难
+
+</font>
+
+----
+##### 实例 ：RNN encoder-decoder with attention
+
+<font size=5>
+
+- 以机器翻译的RNN encoder-decoder 模型（涉及了`dynamicRNN`的所有设计要素）为例，下图是 RNN encoder-decoder 的原始输入：
+  <p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/raw_input.png" width=100%><br><font size=3> Figure. RNN encoder-decoder 原始batch 输入数据</font>
+  </p>
+
+- source word sequences 是encoder RNN的输出，是一个LoDTensor
+- target word sequences 是look_uptable的输入，是一个LoDTensor
+- 上图中一个矩形方块是CPU/GPU内存中一片连续的内存空间，表示一个dense vector
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+1. 对一个mini batch中不等长样本进行排序，最长样本变成batch中的第一个，最短样本是batch中最后一个
+      - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table operaator`
+          - 可以将`LoDRankTable`理解为对LoDTensor中的多个序列按照长度排序LoDRankTable 存储了排序之后的index
+
+2. 构建每个时间步的batch输入：随着时间步增加，每个时间步的batch输入可能会逐渐缩小
+    - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
+3. 每个时间步输出写入一个输出 `LoDTensorArray`
+3. `dynamicRNN`循环结束后, 按照`LoDRankTable`中记录的信息对输出`LoDTensorArray`重排序，还原会原始输入顺序
+    - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
+
+</font>
+
+---
+
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sorted_input.png" width=100%>
+</p>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=100%>
+</p>
+
+<font size=5>
+
+- 执行到第5~7个batch时，batch size将会缩小
+
+</font>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=80%>
+</p>
+
+<font size=5>
+
+- 第5 ~ 7个batch时RNN的`memory`会发生什么？
+    - `memory` 指向某个operator的输出Tensor，在该operator前向计算之后，“取回”其计算结果
+    - 5 ~ 7时，遇到了序列的结束，==下一个时间步计算不再需要在已经结束的序列上展开==
+    - 在`dynamicRNN`中`shrink_memory` operator 用来缩小`memory`的batch输入
+
+</font>
+
+---
+### 运行实例：batch 1 ~ 2
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2.png" width=70%><br><font size=4>Figure. 第1、2个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### 运行实例：batch 3 ~ 4
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/3.png" width=70%><br><font size=4>Figure. 第3、4个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+
+### 运行实例：batch 5 ~ 7
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/4.png" width=70%><br><font size=4>Figure. 第5、6、7个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### ==7.== Fluid 代码结构
+
+---
+### Fluid 代码结构
+
+<table>
+<thead>
+<tr>
+<th>代码结构</th>
+<th>模块结构</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%>
+</p>
+</td>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_2.png" width=60%>
+</p>
+</td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+### ==8.== 文档总结
+
+---
+<font size=5>
+
+- 设计概览
+  - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+- 核心概念
+  - variable 描述 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
+  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
+  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
+  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
+  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
+  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
+
+---
+
+- 重要功能模块
+  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
+  - 内存优化 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
+  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
+  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
+  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
+
+- 开发指南
+  - 支持新设硬件设备库 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
+  - 添加新的Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
+  - 添加新的Kernel [->](
+https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+
+</font>
+
+---
+
+### ==9.== 开发指南
+
+---
+
+#### 建议开发环境：使用 Docker 编译和测试
+
+<font size=5>
+
+Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
+
+PaddlePaddle 在 Dockerhub 地址：[->](
+    https://hub.docker.com/r/paddlepaddle/paddle/tags/)
+
+1. 获取PaddlePaddle的Docker镜像
+    ```bash
+    docker pull paddlepaddle/paddle:latest-dev
+    ```
+
+1. 启动 docker container
+
+    ```bash
+    docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+    ```
+
+1. 进入docker container后，从源码编译，请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
+
+</font>
+
+---
+
+### 一些说明
+
+<font size=5>
+
+1. PaddlePaddle的Docker镜像为了减小体积，默认没有安装vim，可以在容器中执行`apt-get install -y vim`来安装vim。
+1. 开发推荐使用tag为`latest-dev`的镜像，其中打包了所有编译依赖。`latest`及`lastest-gpu`是production镜像，主要用于运行PaddlePaddle程序。
+2. 在Docker中运行GPU程序，推荐使用nvidia-docker，[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。
+   <font size=4>
+
+   ```bash
+   nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+   ```
+   </font>
+
+
+</font>
+
+---
+
+### [如何贡献](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+
+<font size=5>
+
+- ==提交PullRequest前请务必阅读==： [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+- 代码要求
+    1. 代码注释遵守 Doxygen 的样式
+    1. 确保编译器选项 WITH_STYLE_CHECK 已打开，并且编译能通过代码样式检查
+    1. 所有代码必须具有单元测试，且能够通过所有单元测试
+- 使用 `pre-commit` 钩子提交Pull Request
+    1. 帮助格式化源代码（C++，Python）
+    1. 在提交前自动检查一些基本事宜：如每个文件只有一个 EOL，Git 中不要添加大文件等
+    1. 安装pre-commit，并在PaddlePaddle根目录运行：
+    ```bash
+      ➜  pip install pre-commit
+      ➜  pre-commit install
+    ```
+</font>
+
+---
+
+### 如何贡献
+
+<font size=5>
+
+1. 开始开发之前请先建立issue。
+    - 让其它同学知道某项工作已经有人在进行，以避免多人开发同一功能的情况。
+1. 提交PR必须关联相关的issue。做法请参考：[->](https://help.github.com/articles/closing-issues-using-keywords/)
+    - 目的：为了在提交的版本中留有记录描述这个PR是为了开发什么样的功能，为了解决什么样的问题。
+    - 当PR被merge后，关联的issue会被自动关闭。
+1. PR review 中，reviewer的每条comment都必须回复。
+    - 如修改完可直接回复：Done。
+    - 目的：review comment 中可能会有（1）询问类型的问题；（2）可以在下一个PR修改的问题；（3）comment意见不合理等。需要明确回复，以便reviewer和其他人有历史可查，便于区分是否已经进行修改，或者准备下一个PR修改，或者意见不合理可以不用进行修改。
+
+</font>
+
+---
+
+### ==10.== 添加新的 Operator
+
+---
+
+### 概念简介
+
+<font size=5>
+
+添加一个新的operator，会涉及实现以下C++类的派生类：
+
+1. `framework::OperatorBase`: Operator(简写，Op)基类。
+1. `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+1. `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+1. `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel，可以将Op分为两种：
+1. 包含Kernel的Op：继承自OperatorWithKernel，==绝大多数operator都属于这一类==
+1. 不包含kernel的Op，继承自OperatorBase，只有少量Op属于这一类，例如while_op，ifelse_op
+
+<span style="background-color:#DAB1D5;">这里主要介绍带Kernel的Op如何编写。</span>
+
+</font>
+
+---
+
+#### 添加新的Operator需要修改/添加哪些文件？
+
+<font size=5>
+
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+OpProtoMake定义
+</td>
+<td>
+`.cc`文件，<span style="background-color:#DAB1D5;">Backward Op不需要OpProtoMaker</span>
+</td>
+</tr>
+<tr>
+<td>
+Op定义
+</td>
+<td>
+`.cc`文件
+</td>
+</tr>
+<tr>
+<td>
+Kernel实现
+</td>
+<td>
+<span style="background-color:#DAB1D5;">CPU、CUDA共享Kernel实现在`.h`文件中</span>，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
+</td>
+</tr>
+
+<tr>
+<td>
+注册Op
+</td>
+<td>
+Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
+</td>
+</tr>
+
+</tbody>
+</table>
+
+- 添加 Operator 之前请阅读：[Operator 命名规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md)及[Operator Markdown注释规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)。
+- 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。
+- 根据文件名自动构建op和Python端绑定，<span style="background-color:#DAB1D5;">请务必遵守以上命名，否则需要进一步修改PyBind相关文件及CMakeLists.txt</span>。
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step1</span>: 定义ProtoMaker类
+
+<font size=5>
+
+下面均以[clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h)为例进行介绍
+
+- clip_op计算公式：$Out = \min(\max(X, min), max)$
+- 首先定义`ProtoMaker`来描述该Op的输入、输出，并添加注释（<font size=4>*下面代码段的中注释进行了简化，实现时需按照规范添加注释*</font>）：
+
+    ```cpp
+    template <typename AttrType>
+    class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+     public:
+      ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+          : OpProtoAndCheckerMaker(proto, op_checker) {
+        AddInput("X","(Tensor)The input of clip op.");
+        AddOutput("Out", "(Tensor),The output of clip op.");
+        AddAttr<AttrType>(
+            "min", "(float),Minimum value.");
+        AddAttr<AttrType>(
+            "max", "(float),Maximum value.");
+        AddComment(R"DOC(
+        ……
+    )DOC");
+      }
+    };
+    ```
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step2</span>: 定义Operator类
+
+<font size=5>
+
+下面的代码段实现了`clip_op`的定义：
+
+```cpp
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
+    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+```
+</font>
+
+---
+
+### Operator 类中需要完成的工作
+
+<font size=5>
+
+1. clip_op 继承自`OperatorWithKernel`，
+
+    ```cpp
+    using framework::OperatorWithKernel::OperatorWithKernel;
+    ```
+    表示使用基类`OperatorWithKernel`的构造函数。
+
+1. 重写`InferShape`接口。
+    - `InferShape` 为const函数，不能修改Op的成员变
+    - `InferShape` 的参数为 `const framework::InferShapeContext &ctx`，从中可获取到输入输出以及属性
+    - `InferShape` 会被调用两次，一次是编译时（创建op），一次是运行时（调用op的`Run`方法时），需要完成以下功能：
+        1. 做检查， 尽早报错：检查输入数据维度、类型等是否合法
+        2. 设置输出Tensor的形状
+
+<span style="background-color:#DAB1D5;">通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中。</span>
+
+</font>
+
+---
+
+### 补充说明
+
+<font size=5>
+
+1. `InferShape`目前支持两种实现方式，<span style="background-color:#DAB1D5;">二者最后都会生成一个functor注册给OpInfo结构体。</span>
+    1. 继承framework::InferShapeBase，实现为一个functor（参考 [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)）
+    2. override InferShape函数（参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24)）
+
+1. 什么是`functor` ?
+
+   - 类或结构体仅重载了`()`，一般是可被多个kernel复用的计算函数。
+
+        <font size=4>
+
+        ```cpp
+        template <typename T>
+        class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
+         public:
+          void operator()(const platform::CPUDeviceContext& ctx,
+                          framework::Tensor* out,
+                          const framework::Tensor* prob,
+                          const framework::Tensor* labels, const bool softLabel) {
+               ……
+          }
+        };
+        ```
+        </font>
+
+    - 在 clip_op 内也会看到将一段计算函数抽象为functor的使用法： [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step3</span>: 定义OpKernel类
+
+<font size=5>
+
+- `ClipKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
+    1. `typename DeviceContext`: 表示设备类型，不同设备共享同一个Kernel时，需添加该模板参数。不共享时，需要提供针对不同设备的特化实现。
+    1. `typename T` : 表示支持的数据类型，如`float`, `double`等
+
+- 在`ClipKernel`类中重写`Compute`方法
+    1. `Compute`接受输入参数：`const framework::ExecutionContext& context`
+        - `ExecutionContext` 是从 `Scope`中将运行时Op的输入、输出`Variable`组织在一起，使得Op在调用`Compute`方法时，能够简单地通过名字拿到需要的输入输出`Variable`
+        - 与`InferShapeContext`相比，`ExecutionContext` 中增加了设备类型
+    1. 在`Compute`函数里实现`OpKernel`的具体计算逻辑
+
+</font>
+
+---
+#### ClipKernel 代码概览
+
+<font size=5>
+
+```cpp
+template <typename DeviceContext, typename T>
+class ClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int64_t numel = x->numel();
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
+  }
+};
+```
+
+- 为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用， Fluid 使用 Eigen 作为基础的矩阵运算库
+- Fluid对Eigen unsupported Tensor提供了一些基本的封装，可以在`Compute`接口中直接调用
+    - 关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
+
+</font>
+
+---
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step4</span>: 实现反向Op
+
+<font size=5>
+
+- ==**反向Op没有`ProtoMaker`**==，除此之外定义与实现方式前向Op完全一致，不再赘述
+- 这里仅对反向Op的输入输出进行说明：
+    1. 反向Op的输入
+        - 前向Op的输出
+        - 反向传播过程中传递给当前Op的梯度
+            - 需要注意，<span style="background-color:#e1c4c4;">Fluid中，不区分Cost Op和中间层Op，所有Op都必须正确处理接收到的梯度</span>
+    2. 反向Op的输出
+        - 对可学习参数的求导结果
+        - 对所有输入的求导结果
+
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step5</span>: 注册Op及Kernel
+
+<font size=5>
+
+至此Op和Op kernel都已经实现完毕，接下来，需要在`.cc`和`cu`文件中注册op和kernel
+
+1. 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+    <font size=4>
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
+                ops::ClipOpGrad);
+    REGISTER_OP_CPU_KERNEL(
+        clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(
+        clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   - 在上面的代码片段中：
+
+     1. `REGISTER_OP` ： 注册`ops::ClipOp`类，类型名为`clip`，该类的`ProtoMaker`为`ops::ClipOpMaker`，注册`ops::ClipOpGrad`，类型名为`clip_grad`
+     1. `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op，例如：优化算法相关的Op
+     1. `REGISTER_OP_CPU_KERNEL` ：注册`ops::ClipKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::ClipGradKernel`类
+
+    </font>
+1. 按照同样方法，在`.cu`文件中注册GPU Kernel
+   -  <span style="background-color:#e1c4c4;">如果CUDA Kernel的实现基于Eigen，需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU` </span>
+
+</font>
+
+---
+
+##### 编译和Python端绑定
+
+<font size=5>
+
+- 运行下面命令可以仅编译新添加的Op：
+
+  ```
+  make mul_op
+  ```
+  - <span style="background-color:#e1c4c4;">需注意，运行单元测试需要编译整个工程</span>
+
+- 如果遵循前文的文件命名规则，构建过程中，会自动为新增的op添加Python端绑定，并链接到生成的lib库中
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step6</span>: 添加前向单测及梯度检测
+
+<font size=5>
+
+- 新增Op的单元测试统一添加至：[python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)目录
+- 前向Operator单测
+
+    1. Op单元测试继承自`OpTest`，各项具体的单元测试在`TestClipOp`里完成，所有单测case都以`TestXX`命名
+    1. 单元测试Operator，需要：
+        1. 在`setUp`函数定义输入、输出，以及相关的属性参数
+        1. 生成随机的输入数据
+        1. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比
+        1. 反向梯度检测流程测试框架已经实现，直接调用相应接口`check_grad`即可
+
+- `clip_op` 单测代码请参考 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py)，这里不再展开
+
+</font>
+
+---
+#### 编译执行单测
+
+<font size=5>
+
+- `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译
+
+    - <span style="background-color:#e1c4c4;">运行单元测试测时需要编译整个工程，并且编译时需要打开`WITH_TESTING`</span>, 即`cmake paddle_dir -DWITH_TESTING=ON`
+- 编译成功后，执行下面的命令来运行单元测试：
+
+  ```bash
+  make test ARGS="-R test_mul_op -V"
+  ```
+
+  或者:
+
+  ```
+  ctest -R test_mul_op
+  ```
+</font>
+
+---
+
+### 添加Op的一些注意事项
+
+<font size=5>
+
+- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。<span style="background-color:#e1c4c4;">不允许一个文件中包含多个Op</span>，将会导致编译出错。
+- 注册Op时的类型名，需要和该Op的名字一样。<span style="background-color:#e1c4c4;">不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`</span>，会导致单元测试出错。
+- 如果Op<span style="background-color:#e1c4c4;">没有实现CUDA Kernel，不要创建空的`*_op.cu`</span>，会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
+
+</font>
+
+---
+
+### ==10.== 使用相关问题
+
+---
+
+### 定义前向计算
+
+<font size=5>
+
+- 当在python端执行时：
+    ```python
+    import paddle.v2.fluid as fluid
+    ```
+    [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040)定义了两个全局`Program`:
+    ```python
+    # program is a global instance.
+    _main_program_ = Program()
+    _startup_program_ = Program()
+    ```
+
+- 前向定义的过程就是不断往`mian_program`中添加Op和Variable
+- 如果需要执行一个新的`mian_program`时，可以调用调用：
+    ```python
+    def switch_main_program(program):
+        """
+        Switch the main program to a new program.
+        This funtion returns the previous main program.
+        """
+        ……
+    ```
+</font>
+
+---
+
+### 自定义参数的初始化
+
+<font size=5>
+
+- 调用`fluid.ParamAttr(……)`接口，自定义参数的初始化
+
+  ```python
+  w_param_attrs = ParamAttr(name=None,
+      initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+      learning_rate=1.0,
+      regularizer=L1Decay(1.0),
+      trainable=True,
+      clip=GradientClipByValue(-1.0, 1.0),
+  )
+  y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+  ```
+
+- 补充问题：如何创建 `Variable`
+  ```python
+  cur_program = Program()
+  cur_block = cur_program.current_block()
+  new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
+  ```
+
+</font>
+
+---
+
+### 添加反向Op
+
+<font size=5>
+
+- 调用`fluid.backward.append_backward(X)`（`X`是一个Variable），来为一段前向`ProgramDesc`添加反Op
+
+    ```python
+    data = fluid.layers.data(name="data", shape=(2,3,4))
+    out = fluid.layers.fc(input=data,size=128,act=None)
+    loss = fluid.layers.reduce_sum(out)
+    fluid.backward.append_backward(loss=loss)
+    ```
+
+- 添加优化相关的Op
+    ```python
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(loss)
+    ```
+
+- 可以随时调用`print(fluid.default_main_program())`来输出当前的`main_program`
+
+- 当构建完成整个`Program`后，调用下面的接口执行内存优化：
+  ```python
+  fluid.memory_optimize(fluid.default_main_program())
+  ```
+  - _<span style="background-color:#e1c4c4;">注：内存优化目前仍在持续开发中，有可能不够稳定。</span>_
+
+</font>
+
+---
+
+### 总结：编译时执行流程
+
+<font size=5>
+
+- 用户定义前向计算
+- 添加反向Op到`default_main_program`
+- 添加 gradient clipping Op 到
+- 添加 regularization Op 到`default_main_program`
+- 为指定的优化算法，添加相关的状态 variable of optimizer 到`default_startup_program`
+    - 状态相关 variable是指如学习率, 历史 momentum, 二阶momentum等
+- 添加初始化 variable 的Op 到 `default_startup_program`
+- 为整个网络最后一个op，添加设置其接受到的梯度的Op到`default_main_program`
+- 进行内存优化规划
+
+</font>
+
+---
+
+### Feed 数据 (一)：通过 feed 字典
+
+<font size=5>
+
+- 执行executor的run方法时，指定feed字典，feed op 会将指定的数据放到`x`和`y`两个Variable中
+  ```python
+  y_data = np.random.randint(0, 8, [1]).astype("int32")
+  y_tensor = core.Tensor()
+  y_tensor.set(y_data, place)
+
+  x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
+  x_tensor = core.Tensor()
+  x_tensor.set(x_data, place)
+  ……
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetchlist=[avg_cost])
+  ```
+
+- 这种方法较为底层，一般用于单测中
+
+</font>
+
+---
+
+### Feed 数据 (二)：使用 DataFeeder接口
+
+<font size=5>
+
+- 编写一个data_reader函数，data_reader是一个Python generator
+
+  ```python
+  def demo_reader():
+      def random_generator():
+          yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
+      return random_generator
+  ```
+- 在训练任务中使用 DataFeeder 接口
+  ```python
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetchlist=[avg_cost])
+
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  for data in train_reader():
+      cost = exe.run(
+          fluid.default_main_program(),
+          feed=feeder.feed(data),
+          fetch_list=[cost])
+  ```
+
+</font>
+
+---
+
+### 常见问题
+
+<font size=5>
+
+- 如何使用 evaluator ? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
+
+    ```python
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    for pass_id in range(PASS_NUM):
+        accuracy.reset()
+        for data in train_reader():
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+             pass_acc = accuracy.eval(exe)
+             # acc 当前一个batch 的 accuracy
+             # pass_acc 当前batch 的 accuracy
+         pass_total_acc = accuracy.eval(exe)  # 整个pass的accuracy
+    ```
+
+- 如何在训练中测试？[->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
+- 如何保存训练好的模型？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
+- 如何加载训练好的模型进行预测？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
+- 如何在同一个训练任务中定义多个Program，并交替运行？ [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
+- 如何profile？Fluid 实现了profile 工具，可以直接调用。请参考示例 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
+
+
+</font>
+
+---
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
index 75af7354be..3daea71d09 100644
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -17,3 +17,4 @@
   :maxdepth: 1
 
   concepts/use_concepts_cn.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
index 75a43f4af8..fb20bb4f24 100644
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
@@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
   :maxdepth: 1
 
   concepts/index_en.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
index 135beb75d0..6a964d4f85 100644
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.
 
      pip install paddlepaddle
 
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
 
   .. code-block:: bash
 
@@ -28,18 +28,18 @@ PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.
 
      import paddle.dataset.uci_housing as uci_housing
      import paddle.fluid as fluid
-  
+
      with fluid.scope_guard(fluid.core.Scope()):
          # initialize executor with cpu
          exe = fluid.Executor(place=fluid.CPUPlace())
-         # load inference model 
+         # load inference model
          [inference_program, feed_target_names,fetch_targets] =  \
              fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
          # run inference
-         result = exe.run(inference_program, 
-                          feed={feed_target_names[0]: uci_housing.predict_reader()}, 
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
                           fetch_list=fetch_targets)
-         # print predicted price is $12,273.97 
+         # print predicted price is $12,273.97
          print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
 
 执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
index df6619cfd0..680122f258 100644
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
 
      pip install paddlepaddle
 
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
 
   .. code-block:: bash
 
@@ -31,18 +31,18 @@ code:
 
      import paddle.dataset.uci_housing as uci_housing
      import paddle.fluid as fluid
-  
+
      with fluid.scope_guard(fluid.core.Scope()):
          # initialize executor with cpu
          exe = fluid.Executor(place=fluid.CPUPlace())
-         # load inference model 
+         # load inference model
          [inference_program, feed_target_names,fetch_targets] =  \
              fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
          # run inference
-         result = exe.run(inference_program, 
-                          feed={feed_target_names[0]: uci_housing.predict_reader()}, 
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
                           fetch_list=fetch_targets)
-         # print predicted price is $12,273.97 
+         # print predicted price is $12,273.97
          print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
 
 Run :code:`python housing.py` and voila! It should print out a list of predictions
diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md
new file mode 100644
index 0000000000..55ce63ec19
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -0,0 +1,127 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert to your training data
+to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some
+interface to deal with the RecordIO files.
+
+## Generate RecordIO File
+
+Before start training with RecordIO files, you need to convert your training data
+to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes
+as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code snippet would generate a RecordIO `./mnist.recordio` on your host.
+
+**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can
+adjust it flexibly while reading it.
+
+## Use the RecordIO file in a Local Training Job
+
+PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file
+and then you can use them as a Layer in your network configuration, the sample codes as follows:
+
+```python
+    data_file = fluid.layers.io.open_recordio_file(
+        filename="./mnist.recordio",
+        shapes=[(-1, 784),(-1, 1)],
+        lod_levels=[0, 0],
+        dtypes=["float32", "int32"])
+    data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+    img, label = fluid.layers.io.read_file(data_file)
+    hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+
+    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+    place = fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    avg_loss_np = []
+
+    # train a pass
+    batch_id = 0
+    while True:
+        tmp, = exe.run(fetch_list=[avg_loss])
+
+        avg_loss_np.append(tmp)
+        print(batch_id)
+        batch_id += 1
+```
+
+## Use the RecordIO files in Distributed Training
+
+1. generate multiple RecordIO files
+
+For a distributed training job, you may have multiple trainer nodes,
+and one or more RecordIO files for one trainer node, you can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files, the sample codes as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+          filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder)
+```
+
+The above codes would generate multiple RecordIO files on your host like:
+
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+
+2. open multiple RecordIO files by `fluid.layers.io.open_files`
+
+For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes,
+each trainer process reads parts of the whole training data, we usually take the following approach to make the training
+data allocated by each trainer process as uniform as possiable:
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+   file_list = glob.glob(file_pattern)
+   ret_list = []
+   for idx, f in enumerate(file_list):
+       if (idx + trainers) % trainers == trainer_id:
+           ret_list.append(f)
+   return ret_list
+
+trainers = int(os.getenv("TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
+    thread_num=1,
+    shapes=[(-1, 784),(-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_files)
+...
+```
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
index 97aeaf167d..b57af64f44 100644
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -3,5 +3,6 @@
 
 .. toctree::
   :maxdepth: 1
-  
+
+  inference/index_cn.rst
   optimization/index_cn.rst
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000..c8d9992fcc
--- /dev/null
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -0,0 +1,96 @@
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+======================   ========================================
+版本说明                            C++预测库   
+======================   ========================================
+cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_ 
+cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
+cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
+cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+======================   ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配制下面这些编译选项：
+
+=================   =========
+选项                 值   
+=================   =========
+CMAKE_BUILD_TYPE    Release
+FLUID_INSTALL_DIR   安装路径    
+WITH_FLUID_ONLY     ON（推荐）
+WITH_SWIG_PY        OFF（推荐
+WITH_PYTHON         OFF（推荐）
+WITH_GPU            ON/OFF
+WITH_MKL            ON/OFF
+=================   =========
+
+建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码，配制编译选项（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+     PADDLE_ROOT=/path/of/capi
+     git clone https://github.com/PaddlePaddle/Paddle.git
+     cd Paddle
+     mkdir build
+     cd build
+     cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DWITH_FLUID_ONLY=ON \
+           -DWITH_SWIG_PY=OFF \
+           -DWITH_PYTHON=OFF \
+           -DWITH_MKL=OFF \
+           -DWITH_GPU=OFF  \
+           ..
+      make
+      make inference_lib_dist
+
+成功编译后，使用C++预测库所需的依赖（包括：（1）编译出的PaddlePaddle预测库和头文件；（2）第三方链接库和头文件；（3）版本信息与编译选项信息）
+均会存放于PADDLE_ROOT目录中。目录结构如下：
+
+  .. code-block:: text
+
+     PaddleRoot/
+     ├── CMakeCache.txt
+     ├── paddle
+     │   └── fluid
+     │       ├── framework
+     │       ├── inference
+     │       ├── memory
+     │       ├── platform
+     │       ├── pybind
+     │       └── string
+     ├── third_party
+     │   ├── boost
+     │   │   └── boost
+     │   ├── eigen3
+     │   │   ├── Eigen
+     │   │   └── unsupported
+     │   └── install
+     │       ├── gflags
+     │       ├── glog
+     │       ├── mklml
+     │       ├── protobuf
+     │       ├── snappy
+     │       ├── snappystream
+     │       └── zlib
+     └── version.txt
+     
+version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号，如：
+
+  .. code-block:: text
+
+     GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+     WITH_MKL: ON
+     WITH_GPU: ON
+     CUDA version: 8.0
+     CUDNN version: v5
diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst
new file mode 100644
index 0000000000..a903423548
--- /dev/null
+++ b/doc/fluid/howto/inference/index_cn.rst
@@ -0,0 +1,8 @@
+预测库
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  build_and_install_lib_cn.rst
+  inference_support_in_fluid_cn.md
diff --git a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
new file mode 100644
index 0000000000..309b17fccd
--- /dev/null
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
@@ -0,0 +1,304 @@
+# 使用指南
+
+## 目录：
+
+- Python Inference API
+- Inference C++ API
+- Inference实例
+- Inference计算优化
+
+## Python Inference API **[改进中]**
+- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295))
+
+  ```python
+  def save_inference_model(dirname,
+                           feeded_var_names,
+                           target_vars,
+                           executor,
+                           main_program=None,
+                           model_filename=None,
+                           params_filename=None):
+  ```
+  Inference模型和参数将会保存到`dirname`目录下：
+  - 序列化的模型
+    - `model_filename`为`None`，保存到`dirname/__model__`
+    - `model_filename`非`None`，保存到`dirname/model_filename`
+  - 参数
+    - `params_filename`为`None`，单独保存到各个独立的文件，各文件以参数变量的名字命名
+    - `params_filename`非`None`，保存到`dirname/params_filename`
+
+- 两种存储格式
+  - 参数保存到各个独立的文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`None`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0
+    ```
+  - 参数保存到同一个文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`__params__`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ __params__
+    ```
+- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380))
+  ```python
+  def load_inference_model(dirname,
+                           executor,
+                           model_filename=None,
+                           params_filename=None):
+    ...
+    return [program, feed_target_names, fetch_targets]
+  ```
+
+## 链接Fluid Inference库
+- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))
+
+  - GCC配置
+    ```bash
+    $ g++ -o a.out -std=c++11 main.cc \
+          -I${PADDLE_ROOT}/ \
+          -I${PADDLE_ROOT}/third_party/install/gflags/include \
+          -I${PADDLE_ROOT}/third_party/install/glog/include \
+          -I${PADDLE_ROOT}/third_party/install/protobuf/include \
+          -I${PADDLE_ROOT}/third_party/eigen3 \
+          -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \
+          -lrt -ldl -lpthread
+    ```
+
+  - CMake配置
+    ```cmake
+    include_directories(${PADDLE_ROOT}/)
+    include_directories(${PADDLE_ROOT}/third_party/install/gflags/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/glog/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include)
+    include_directories(${PADDLE_ROOT}/third_party/eigen3)
+    target_link_libraries(${TARGET_NAME}
+                          ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so
+                          -lrt -ldl -lpthread)
+    ```
+
+  - 设置环境变量：
+  `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH`
+
+
+
+## C++ Inference API
+
+- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91))
+
+  - 1、 初始化设备
+    ```cpp
+    #include "paddle/fluid/framework/init.h"
+    paddle::framework::InitDevices(false);
+    ```
+
+  - 2、 定义place，executor，scope
+    ```cpp
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+    auto* scope = new paddle::framework::Scope();
+    ```
+
+  - 3、 加载模型
+    ```cpp
+    #include "paddle/fluid/inference/io.h"
+    auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+    // or
+    auto inference_program = paddle::inference::Load(executor,
+                                                     *scope,
+                                                     dirname + "/" + model_filename,
+                                                     dirname + "/" + params_filename);
+    ```
+
+  - 4、 获取`feed_target_names`和`fetch_target_names`
+    ```cpp
+    const std::vector<std::string>& feed_target_names = inference_program->GetFeedTargetNames();
+    const std::vector<std::string>& fetch_target_names = inference_program->GetFetchTargetNames();
+    ```
+
+  - 5、 准备`feed`数据
+    ```cpp
+    #include "paddle/fluid/framework/lod_tensor.h"
+    std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+    ...
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+    for (size_t i = 0; i < feed_target_names.size(); ++i) {
+      // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+      feed_targets[feed_target_names[i]] = cpu_feeds[i];
+    }
+    ```
+
+  - 6、 定义`Tensor`来`fetch`结果
+    ```cpp
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs;
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+      fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+    }
+    ```
+
+  - 7、 执行`inference_program`
+    ```cpp
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    ```
+
+  - 8、 使用`fetch`数据
+    ```cpp
+    for (size_t i = 0; i < cpu_fetchs.size(); ++i) {
+      std::cout << "lod_i: " << cpu_fetchs[i]->lod();
+      std::cout << "dims_i: " << cpu_fetchs[i]->dims();
+      std::cout << "result:";
+      float* output_ptr = cpu_fetchs[i]->data<float>();
+      for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) {
+        std::cout << " " << output_ptr[j];
+      }
+      std::cout << std::endl;
+    }
+    ```
+    针对不同的数据，4. - 8.可执行多次。
+
+  - 9、 释放内存
+    ```cpp
+    delete scope;
+    ```
+
+
+- 接口说明
+
+  ```cpp
+  void Run(const ProgramDesc& program, Scope* scope,
+           std::map<std::string, const LoDTensor*>& feed_targets,
+           std::map<std::string, LoDTensor*>& fetch_targets,
+           bool create_vars = true,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");
+  ```
+  - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`，用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。
+  - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致，可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program`
+  - 默认情况下，除了`persistable`属性设置为`True`的`Variable`之外，每次执行`executor.Run`会创建一个局部`Scope`，并且在这个局部`Scope`中创建和销毁所有的`Variable`，以最小化空闲时的内存占用。
+  - `persistable`属性为`True`的`Variable`有：
+    - Operators的参数`w`、`b`等
+    - `feed_op`的输入变量
+    - `fetch_op`的输出变量
+
+
+- **不在每次执行时创建和销毁变量
+ ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    executor.CreateVariables(*inference_program, scope, 0);
+    // Call as many times as you like
+    executor.Run(
+        *inference_program, scope, feed_targets, fetch_targets, false);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁变量的时间（约占每次`Run`总时间的1% ~ 12%）
+    - 执行结束后可获取所有Operators的计算结果
+  - **缺点**
+    - 空闲时也会占用大量的内存
+    - 在同一个`Scope`中，相同的变量名是公用同一块内存的，容易引起意想不到的错误
+
+
+- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    auto ctx = executor.Prepare(*inference_program, 0);
+    // Call as many times as you like if you have no need to change the inference_program
+    executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁Op的时间
+  - **缺点**
+    - 一旦修改了`inference_program`，则需要重新创建`ctx`
+
+
+- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))**
+  - 主线程
+    - 1、 初始化设备
+    - 2、 定义`place`，`executor`，`scope`
+    - 3、 加载模型，得到`inference_program`
+  - 从线程
+    - **复制`inference_program`得到`copy_program`，修改`copy_program`的`feed_holder_name`和`fetch_holder_name`**
+      ```cpp
+      auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+                 new paddle::framework::ProgramDesc(*inference_program));
+      std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+      std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id);
+      copy_program->SetFeedHolderName(feed_holder_name);
+      copy_program->SetFetchHolderName(fetch_holder_name);
+      ```
+    - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names`
+    - 5、 准备feed数据，定义Tensor来fetch结果
+    - 6、 执行`copy_program`
+      ```cpp
+      executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name);
+      ```
+    - 7、 使用fetch数据
+  - 主线程
+    - 8、 释放资源
+
+
+- 基本概念
+  - 数据相关：
+    - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md)，一个N维数组，数据可以是任意类型（int，float，double等）
+    - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)，带LoD(Level-of-Detail)即序列信息的Tensor
+    - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)，记录了变量Variable
+  - 执行相关：
+    - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md)，无状态执行器，只跟设备相关
+    - Place
+      - CPUPlace，CPU设备
+      - CUDAPlace，CUDA GPU设备
+  - 神经网络表示：
+    - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md).
+
+    详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md)
+
+
+
+## Inference实例
+
+  1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc)
+  1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc)
+  1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc)
+  1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc)
+  1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc)
+  1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc)
+  1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc)
+
+
+## Inference计算优化
+- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py))
+  ```python
+  class InferenceTranspiler:
+    def transpile(self, program, place, scope=None):
+        ...
+        if scope is None:
+            scope = global_scope()
+        ...
+  ```
+  - 使用`InferenceTranspiler`将会直接修改`program`。
+  - 使用`InferenceTranspiler`会修改参数的值，请确保`program`的参数在`scope`内。
+- 支持的优化
+  - 融合batch_norm op的计算
+- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py))
+  ```python
+  import paddle.fluid as fluid
+  # NOTE: Applying the inference transpiler will change the inference_program.
+  t = fluid.InferenceTranspiler()
+  t.transpile(inference_program, place, inference_scope)
+  ```
+
+
+
+
+## 内存使用优化
+- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py))
+  ```python
+  fluid.memory_optimize(inference_program)
+  ```
diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md
deleted file mode 120000
index db30af7f53..0000000000
--- a/doc/fluid/howto/optimization/benchmark/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../benchmark/cluster/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
deleted file mode 120000
index ca963ef5f0..0000000000
--- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
new file mode 100644
index 0000000000..9b55a66ded
--- /dev/null
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+## 堆内存分析和优化
+
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。
+
+
+目前有很多内存泄漏分析工具，比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+
+因为Fluid是用Python驱动C++ core来运行，valgrind直接分析非常困难，需要自己编译debug版本的、带valgrind支持的专用Python版本，而且输出的信息中大部分是Python自己的符号和调用信息，分析起来很困难，另外使用valgrind会让程序运行速度变得非常慢，所以不建议使用。
+
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+
+gperftool主要支持以下四个功能：
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+
+对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
+
+## 使用流程
+#### 环境
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
+
+#### 使用流程
+
+- 安装google-perftools
+
+```
+apt-get install libunwind-dev 
+apt-get install google-perftools
+```
+
+- 安装pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- 设置运行环境
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump，默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+随着程序的运行，会在perf_log这个文件夹下生成很多文件，如下：
+
+```
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0006.heap
+```
+
+- 使用pprof对heap文件进行分析。分析有两种模式：
+	- 完整模式。会对当前heap做一个分析，显示目前分配内存一些调用路径。
+
+	```
+	pprof --pdf python test.log.0012.heap
+	```
+	上述命令会生成一个profile00x.pdf的文件，可以直接打开，例如：[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出，在CPU版本fluid的运行过程中，分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少，所以被忽略了，这对于分配内存泄漏是很不方便的，因为泄漏是一个缓慢的过程，在这种图中是无法看到的。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
+	
+	- Diff模式。可以对两个时刻的heap做diff，把一些内存分配没有发生变化的模块去掉，而把增量部分显示出来。
+	```
+	pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+	```
+	生成的结果为：[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+	
+	从图中可以看出：ProgramDesc这个结构，在两个版本之间增长了200MB+，所以这里有很大的内存泄漏的可能性，最终结果也确实证明是这里造成了泄漏。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
+	![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
+	
diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png
new file mode 100644
index 0000000000..67daf566f9
Binary files /dev/null and b/doc/fluid/images/1.png differ
diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png
new file mode 100644
index 0000000000..43367777f4
Binary files /dev/null and b/doc/fluid/images/2.png differ
diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png
new file mode 100644
index 0000000000..481021ef30
Binary files /dev/null and b/doc/fluid/images/3.png differ
diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png
new file mode 100644
index 0000000000..4279f41e06
Binary files /dev/null and b/doc/fluid/images/4.png differ
diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png
new file mode 100644
index 0000000000..75369f5378
Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ
diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png
new file mode 100644
index 0000000000..0bc9b2fd0e
Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ
diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png
new file mode 100644
index 0000000000..b29c0d779e
Binary files /dev/null and b/doc/fluid/images/executor.png differ
diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png
new file mode 100644
index 0000000000..aa99472c0f
Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ
diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png
new file mode 100644
index 0000000000..554782ba54
Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ
diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png
new file mode 100644
index 0000000000..4219efccbb
Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ
diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png
new file mode 100644
index 0000000000..e46db4c9c6
Binary files /dev/null and b/doc/fluid/images/layer.png differ
diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png
new file mode 100644
index 0000000000..3975b06f61
Binary files /dev/null and b/doc/fluid/images/operator1.png differ
diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png
new file mode 100644
index 0000000000..b7bb1fae20
Binary files /dev/null and b/doc/fluid/images/operator2.png differ
diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png
new file mode 100644
index 0000000000..14e77511d6
Binary files /dev/null and b/doc/fluid/images/place.png differ
diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png
new file mode 100644
index 0000000000..e8e459e1b3
Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ
diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png
new file mode 100644
index 0000000000..0656336914
Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ
diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png
new file mode 100644
index 0000000000..db5bfa1231
Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ
diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png
new file mode 100644
index 0000000000..0725f92d2b
Binary files /dev/null and b/doc/fluid/images/raw_input.png differ
diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png
new file mode 100644
index 0000000000..59b0de6fb3
Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ
diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png
new file mode 100644
index 0000000000..ff60112836
Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ
diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png
new file mode 100644
index 0000000000..422973c0dc
Binary files /dev/null and b/doc/fluid/images/transpiler.png differ
diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png
new file mode 100644
index 0000000000..ffc94e3d89
Binary files /dev/null and b/doc/fluid/images/user_interface.png differ
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
index b104a6318d..7b34ba8d07 100644
--- a/doc/mobile/CMakeLists.txt
+++ b/doc/mobile/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
         "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
         "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_mobile_docs
         ${CMAKE_CURRENT_SOURCE_DIR}
         ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -49,5 +50,3 @@ sphinx_add_target(paddle_mobile_docs_cn
         ${SPHINX_CACHE_DIR_CN}
         ${CMAKE_CURRENT_SOURCE_DIR}
         ${SPHINX_HTML_DIR_CN})
-
-add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index cdd6917239..0607748b75 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数：
 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
 
 - 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
 
-执行上述`docker run`命令时，容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+执行上述`docker run`命令时，容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
 
 ## 基于Linux交叉编译环境的编译方式
 本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
index 6af16fc114..572063e801 100644
--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
 We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
 ```
 
 The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
@@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
 
 The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
 
-The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
+The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
 
 The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
 
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
index 8297316e8f..56d1515005 100644
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
@@ -1,9 +1,9 @@
 移动端
-=====
+======
 
 ..  toctree::
   :maxdepth: 1
 
   cross_compiling_for_android_cn.md
   cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
\ No newline at end of file
+  cross_compiling_for_raspberry_cn.md
diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md
new file mode 100644
index 0000000000..553a9dbe15
--- /dev/null
+++ b/doc/survey/dynamic_graph.md
@@ -0,0 +1,378 @@
+# Automatic Differentiation with the Tape
+
+## Automatic Differentiation
+
+A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers.  Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf).
+
+## The Tape
+
+Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass:
+
+1. from the forward pass program itself, or
+1. from the execution trace of the forward pass program, which is often known as the *tape*.
+
+This article surveys systems that follow the latter strategy.
+
+## Dynamic Network
+
+When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration.  This is known as *dynamic network*.
+
+Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years.  This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
+
+## An Overview
+
+Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
+
+Consider the following code feedforward model.
+
+```python
+x = Variable(randn(20, 1)))
+label = Variable(randint(1))
+W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
+h = matmul(W_1, x)
+pred = matmul(W_2, x)
+loss = softmax(pred, label)
+loss.backward()
+```
+
+### 1) Dynet uses List to encode the Tape
+
+During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
+
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    node [
+        fontsize = "16"
+        shape = "ellipse"
+    ];
+    edge [];
+    "node0" [
+        label = "<f0> type: matmul | <f1> input: W_1, x | <f2> output: h"
+        shape = "record"
+    ];
+    "node1" [
+        label = "<f0> type: matmul | <f1> input: W_2, h | <f2> output: pred"
+        shape = "record"
+    ];
+    "node2" [
+        label = "<f0> type: softmax | <f1> input: pred, label | <f2> output: loss"
+        shape = "record"
+    ];
+    "node0":f0 -> "node1":f0 [];
+    "node1":f0 -> "node2":f0 [];
+}
+</details>
+
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
+
+### 2) Pytorch uses Node Graph to encode the Tape
+
+The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order.
+
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    
+    subgraph function {
+        node [
+            fontsize = "16"
+            style = filled
+            shape = "record"
+        ];
+        "matmul0" [ label = "<f0> type: matmul | prev_func: None" ];
+        "matmul1" [ label = "<f0> type: matmul | prev_func: matmul" ];
+        "softmax" [ label = "<f0> type: softmax | prev_func: matmul" ];
+    }
+    
+    subgraph variable {
+        node [
+            fontsize = "16"
+            shape = "Mrecord"
+            style = filled
+            fillcolor = white
+        ];
+        "x" [ label = "<f0> x | <f1> creator: None" ];
+        "label" [ label = "<f0> label | <f1> creator: None" ];
+        "W_1" [ label = "<f0> W_1 | <f1> creator: None" ];
+        "W_2" [ label = "<f0> W_2 | <f1> creator: None" ];
+        "h" [ label = "<f0> h | <f1> creator: None" ];
+        "pred" [ label = "<f0> pred | <f1> creator: matmul" ];
+        "loss" [ label = "<f0> loss | <f1> creator: softmax" ];
+    }
+    
+    subgraph data_flow {
+        "x":f0 -> "matmul0":f0;
+        "W_1":f0 -> "matmul0":f0;
+        "matmul0":f0 -> "h":f0;
+    
+        "h":f0 -> "matmul1":f0;
+        "W_2":f0 -> "matmul1":f0;
+        "matmul1":f0 -> "pred":f0;
+    
+        "pred":f0 -> "softmax":f0;
+        "label":f0 -> "softmax":f0;
+        "softmax":f0 -> "loss":f0;
+    }
+
+    subgraph prev_func {
+        edge [color="red", arrowsize="0.6", penwidth="1", constraint=false];
+        "matmul1":f1 -> "matmul0":f0;
+        "softmax":f1 -> "matmul1":f0;
+        label = "prev_func";
+    }
+}
+</details>
+
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
+
+Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix.
+
+## Design choices
+
+### 1) Dynet's List vs Pytorch's Node Graph
+
+What's good about List:
+1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
+1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
+
+What's good about Node Graph:
+1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
+```python
+result = BigNet(data)
+loss = SmallNet(data)
+loss.backward()
+```
+
+### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
+
+Dynet builds the list in a symbolic matter. Consider the following example
+```python
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg()
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion.
+
+Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+
+
+## What can fluid learn from them?
+
+TBD
+
+# Appendix
+
+### Overview
+
+| Framework | Has Tape | Core in C++ | First Release Date |
+|-----------|----------|-------------|--------------------|
+| Autograd  | No       | No          | Mar 5, 2015        |
+| Chainer   | No       | No          | Jun 5, 2015        |
+| Pytorch   | No       | Yes         | Aug 31, 2016       |
+| Dynet     | Yes      | Yes         | Oct 12, 2016       |
+
+### Source Code
+#### Autograd
+[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNode is constructed.
+```python
+# User API
+def make_grad(fun, x):
+    start_node = VJPNode.new_root()
+    end_value, end_node =  trace(start_node, fun, x)
+    return backward_pass(g, end_node), end_value
+
+# trace the forward pass by creating VJPNodes
+def trace(start_node, fun, x):
+    with trace_stack.new_trace() as t:
+        start_box = new_box(x, t, start_node)
+        end_box = fun(start_box)
+        return end_box._value, end_box._node
+
+def backward_pass(g, end_node):
+    outgrads = {end_node : (g, False)}
+    for node in toposort(end_node):
+        outgrad = outgrads.pop(node)
+        ingrads = node.vjp(outgrad[0])
+        for parent, ingrad in zip(node.parents, ingrads):
+            outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
+    return outgrad[0]
+
+# Every VJPNode corresponds to a op_grad
+class VJPNode(Node):
+    __slots__ = ['parents', 'vjp']
+    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
+        self.parents = parents
+        vjpmaker = primitive_vjps[fun]
+        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)
+```
+#### Chainer
+Example Code
+```python
+# (1) Function Set definition, creates FunctionNode
+model = FunctionSet(
+    l1=F.Linear(784, 100),
+    l2=F.Linear(100, 100),
+    l3=F.Linear(100, 10)).to_gpu()
+
+# (2) Optimizer Setup
+opt = optimizers.SGD()
+opt.setup(model)
+
+# (3) Forward computation
+def forward(x, t):
+    h1 = F.relu(model.l1(x))
+    h2 = F.relu(model.l2(h1))
+    y = model.l3(h2)
+    return F.softmax_cross_entropy(y, t)
+
+# (4) Training loop
+for epoch in xrange(n_epoch):
+    for i in xrange(0, N, b_size):
+        x = Variable(to_gpu(...))
+        t = Variable(to_gpu(...))
+        opt.zero_grads()
+        loss = forward(x, t)
+        loss.backward()
+        opt.update()
+```
+In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` is pointed to the `FunctionNode`.
+```python
+class FunctionNode(object):
+    ...
+    def apply(self, inputs):
+        outputs = self.forward(inputs)
+        ret = tuple([variable.Variable(y, requires_grad=requires_grad)
+                     for y in outputs])
+        # Topological ordering
+        self.rank = max([x.rank for x in inputs]) if input_vars else 0
+        # Add backward edges
+        for y in ret:
+            y.creator_node = self
+        self.inputs = tuple([x.node for x in input_vars])
+        self.outputs = tuple([y.node for y in ret])
+
+        return ret
+```
+`loss.backward()` will calculate the accumulated gradient of all variables. All the backward of `FunctionNode`s will be called based on the topological order.
+```python
+class VariableNode(object):
+    ...
+    def backward(self, retain_grad, loss_scale):
+        if self.creator_node is None:
+            return
+
+        cand_funcs = []
+        seen_set = set()
+        grads = {}
+
+        # Initialize error by 1, if this is a loss variable
+        if self.data.size == 1 and self._grad_var is None:
+            self.grad = numpy.ones_like(self.data)
+        grads[self._node] = self._grad_var
+
+        def add_cand(cand):
+            if cand not in seen_set:
+                # Negate since heapq is min-heap. This is a global variable
+                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
+                seen_set.add(cand)
+
+        add_cand(self.creator_node)
+
+        while cand_funcs:
+            _, _, func = heapq.heappop(cand_funcs)
+            gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad)
+
+            for x, gx in enumerate(gxs):
+                if x in grads:
+                    grads[x] += gx
+                else:
+                    grads[x] = gx
+
+                if x.creator_node is not None:
+                    add_cand(x.creator_node)
+```
+
+#### PyTorch
+Example Code
+```python
+x = Variable(torch.ones(5, 5))
+y = Variable(torch.ones(5, 5) * 4)
+z = x ** 2 + x * 2 + x * y + y
+z.backward(torch.ones(5, 5))
+```
+The trace is done by `Variable.creator` and `Function.previous_functions`.
+```python
+class Variable(object):
+    def __init__(self, tensor, creator=None, requires_grad=True):
+        if creator is None:
+            creator = Leaf(self, requires_grad)
+        self.data = tensor
+        self.creator = creator
+        self._grad = None
+
+    def backward(self, gradient=None):
+        if gradient is None:
+            if self.data.numel() != 1:
+                raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
+            gradient = self.data.new(1).fill_(1)
+        self._execution_engine.run_backward(self, gradient)
+
+class Function(obejct):
+    # ...
+    def _do_forward(self, *input):
+        unpacked_input = tuple(arg.data for arg in input)
+        raw_output = self.forward(*unpacked_input)
+
+        # mark output.creator = self for backward trace
+        output = tuple(Variable(tensor, self) for tensor in raw_output)
+
+        self.previous_functions = [(arg.creator, id(arg)) for arg in input]
+        self.output_ids = {id(var): i for i, var in enumerate(output)}
+        return output
+
+    def _do_backward(self, grad_output):
+        return self.backwaerd(grad_output)
+```
+The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd.
+
+#### DyNet
+Example code
+```python
+model = dy.model()
+W_p = model.add_parameters((20, 100))
+b_p = model.add_parameters(20)
+E = model.add_lookup_parameters((20000, 50))
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg() # init tape
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traverse the tape in the reverse order.
+```c++
+void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
+  ...  
+  for (int i = num_nodes - 1; i >= 0; --i) {
+    // each node corresponds to an op
+    node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai);
+  }
+  ...
+}
+```
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 76b82fd97f..890f706155 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -16,8 +16,8 @@ import os, subprocess
 sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index 5aa5c1381f..5b09464cb9 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -16,8 +16,8 @@ import os, subprocess
 sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
 
 
 MarkdownParser = parser.CommonMarkParser
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index be957d37b1..d230a1b921 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_v2_docs
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -50,6 +51,4 @@ sphinx_add_target(paddle_v2_docs_cn
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
 
-add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
-
 add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index 2670a21a22..0c74522cb0 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index f846928954..6421c53082 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -19,11 +19,11 @@
 ----------------
 
 PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到，您也可以
-在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_ 找到 paddle_manylinux_devel
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ 找到，您也可以
+在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
 镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
 
-如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。
 
 编译PaddlePaddle，需要执行：
 
@@ -35,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
    # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
    docker build -t paddle:dev .
    # 3. 执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
-注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。如果使用自行
-构建的镜像（上述第4步）会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
-最后的执行脚本的命令。
+注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
 
 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
 
@@ -72,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
 
 如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
-   bash /paddle/paddle/scripts/docker/build.sh
-   cd /paddle/build
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
    ctest -R test_sum_op -V
 
 .. _faq_docker:
@@ -108,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 学习 Docker 有多难？
 
-  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+  理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
 
 - 我可以用 IDE 吗？
 
@@ -116,17 +114,16 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
   很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
 
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
 
   就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
 
 - 可以并行编译吗？
 
-  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+  是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
 
 - Docker 需要 sudo
 
@@ -134,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 在 Windows/MacOS 上编译很慢
 
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
 
 - 磁盘不够
 
-  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
 
 
 .. _compile_deps:
@@ -198,7 +195,7 @@ BLAS
 
 PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
 `OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
-还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
 
 如果关闭MKL，则会使用OpenBLAS作为BLAS库。
 
@@ -214,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，
 编译选项的设置
 ++++++++++++++
 
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
 
 ..  code-block:: bash
 
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
index d1b5b88dff..b08b45d43e 100644
--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -11,7 +11,7 @@ To build PaddlePaddle, you need
 1. A computer -- Linux, Windows, MacOS.
 2. Docker.
 
-Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.
 We run all the tools by running this image.
 
 .. _build_step:
@@ -23,9 +23,11 @@ You need to use Docker to build PaddlePaddle
 to avoid installing dependencies by yourself. We have several pre-built
 Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
 you can also find how to build and use paddle_manylinux_devel Docker image from
-`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_
+`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__
 Or you can build your own image from source as the optional step below:
 
+If you don't wish to use docker，you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation.
+
 .. code-block:: bash
 
    # 1. clone the source code
@@ -34,14 +36,12 @@ Or you can build your own image from source as the optional step below:
    # 2. Optional: build development docker image from source
    docker build -t paddle:dev .
    # 3. Run the following command to build a CPU-Only binaries
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
 NOTE: The above command try to mount the current working directory (root directory of source code)
-into :code:`/paddle` directory inside docker container. If you are using your own image
-(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
-command in step 3.
+into :code:`/paddle` directory inside docker container.
 
 When the compile finishes, you can get the output whl package under
 build/python/dist, then you can choose to install the whl on local
@@ -74,21 +74,21 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
 
 If you wish to run only one unit test, like :code:`test_sum_op`:
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
-   bash /paddle/paddle/scripts/docker/build.sh
-   cd /paddle/build
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
    ctest -R test_sum_op -V
 
 .. _faq_docker:
 
 Frequently Asked Questions
-----------------
+---------------------------
 
 - What is Docker?
 
@@ -110,7 +110,7 @@ Frequently Asked Questions
 
 - How difficult is it to learn Docker?
 
-    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
+    It takes you ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
 
 - Can I use my favorite IDE?
 
@@ -118,17 +118,16 @@ Frequently Asked Questions
 
   Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
 
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
 
   so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
 
 - Does Docker do parallel building?
 
-  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
+  Our building Docker image runs a  `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
 
 - Docker requires sudo
 
@@ -136,16 +135,16 @@ Frequently Asked Questions
 
 - Docker on Windows/MacOS builds slowly
 
-  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to `this issue  <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
 
 - Not enough disk space
 
-  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
+  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
 
 .. _compile_deps:
 
 Appendix: Compile Dependencies
-----------------
+-------------------------------
 
 PaddlePaddle need the following dependencies when compiling, other dependencies
 will be downloaded automatically.
@@ -166,11 +165,11 @@ will be downloaded automatically.
 .. _build_options:
 
 Appendix: Build Options
-----------------
+-------------------------
 
 Build options include whether build binaries for CPU or GPU, which BLAS
 library to use etc. You may pass these settings when running cmake.
-For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
+For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`__ 。
 
 
 You can add :code:`-D` argument to pass such options, like:
@@ -219,7 +218,7 @@ keep on with latest cuDNN versions. Be sure to run with the same version of cuDN
 you built.
 
 Pass Compile Options
-++++++++++++++
+++++++++++++++++++++++
 
 You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
 When running cmake command, it will search system paths like
diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst
index 79d214635a..106c86bace 100644
--- a/doc/v2/build_and_install/docker_install_cn.rst
+++ b/doc/v2/build_and_install/docker_install_cn.rst
@@ -73,6 +73,7 @@
 当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
 
   .. code-block:: bash
+
      docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
      cd /work
      python train.py
@@ -97,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
 国内用户可以使用下面的镜像源来加速访问：
 
-  .. code-block: bash
+  .. code-block:: bash
 
     docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst
index e0e0559fb8..25aecb8d0d 100644
--- a/doc/v2/build_and_install/docker_install_en.rst
+++ b/doc/v2/build_and_install/docker_install_en.rst
@@ -80,6 +80,7 @@ Also, you can go into the container shell, run or debug your code
 interactively:
 
   .. code-block:: bash
+
      docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
      cd /work
      python train.py
@@ -104,7 +105,7 @@ We provide a packaged book image, simply issue the command:
 
 For users in China, we provide a faster mirror:
 
-  .. code-block: bash
+  .. code-block:: bash
 
     docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
diff --git a/doc/v2/build_and_install/index_cn.rst b/doc/v2/build_and_install/index_cn.rst
index e079bb661f..1a9305ac4b 100644
--- a/doc/v2/build_and_install/index_cn.rst
+++ b/doc/v2/build_and_install/index_cn.rst
@@ -6,7 +6,7 @@
 PaddlePaddle针对不同的用户群体提供了多种安装方式。
 
 专注深度学习模型开发
------------------
+--------------------
 
 PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
 
@@ -18,7 +18,7 @@ PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
 这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。
 
 关注底层框架
-----------
+-------------
 
 PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
 
@@ -45,7 +45,7 @@ PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
 
 
 常见问题汇总
------------
+--------------
 
 如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案：
 
diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
index 5b3de0f8c3..7990bacbd6 100644
--- a/doc/v2/build_and_install/index_en.rst
+++ b/doc/v2/build_and_install/index_en.rst
@@ -1,12 +1,12 @@
 install and Compile
-==========
+======================
 
 .. _install_steps:
 
 PaddlePaddle provides various methods of installation for many different users
 
 Focus on Deep Learning Model Development
------------------
+----------------------------------------
 
 PaddlePaddle provides lots of packages of python wheel , that pip can install:
 
@@ -18,7 +18,7 @@ PaddlePaddle provides lots of packages of python wheel , that pip can install:
 This is the most convenient way of installation. Please choose the right installation package with machine configure and system.
 
 Follow the Bottom Frame
-----------
+------------------------
 
 PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
 
diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst
index aa1dc6ee2c..853bdb21bb 100644
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -10,20 +10,38 @@ PaddlePaddle可以使用常用的Python包管理工具
 使用pip安装
 ------------------------------
 
-
-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件，版本为cpu_avx_openblas。
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
+当前的默认版本为0.12.0，cpu_avx_openblas，您可以通过指定版本号来安装其它版本，例如:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
 
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
 
   .. code-block:: bash
 
      pip install paddlepaddle-gpu
 
+当前的默认版本也是0.12.0，PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+=================================   ========================================
+
+您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ 中找到paddlepaddle-gpu的各个发行版本。
+
 如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
 您可以从下面的表格中找到需要的版本：
 
@@ -37,11 +55,11 @@ PaddlePaddle可以使用常用的Python包管理工具
     :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
     :widths: 1, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
@@ -68,7 +86,7 @@ PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.or
 ------------------------------
 
 - paddlepaddle*.whl is not a supported wheel on this platform.
-  
+
   出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
 
     .. code-block:: bash
diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst
index a70821eb48..fecf6d3712 100644
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -12,20 +12,38 @@ Install using pip
 ------------------------------
 
 Run the following command to install PaddlePaddle on the current
-machine, it will also download requirements, the version is cpu_avx_openblas.
+machine, it will also download requirements.
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
+the default version is 0.12.0, cpu_avx_openblas, you can specify the versions to satisfy your demands, like:
 
-If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
+If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run:
 
   .. code-block:: bash
 
      pip install paddlepaddle-gpu
 
-If you wish to install the latest develop branch PaddlePaddle, 
+The default version is also 0.12.0, PaddlePaddle provides several versions of packages for different needs, as shown in the table:
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            0.12.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0.post87     0.11.0 built with CUDA 8.0 and cuDNN 7
+paddlepaddle-gpu==0.11.0.post8      0.11.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0            0.11.0 built with CUDA 7.5 and cuDNN 5
+=================================   ========================================
+
+You can find all versions released of paddlepaddle-gpu in `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ .
+
+If you wish to install the latest develop branch PaddlePaddle,
 you can download the latest whl package from our CI system. Access
 the below links, log in as guest, then click at the "Artifact"
 tab, you'll find the download link of whl packages.
@@ -40,11 +58,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
     :header: "version", "cp27-cp27mu", "cp27-cp27m"
     :widths: 1, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
@@ -78,7 +96,7 @@ FAQ
 ------------------------------
 
 - paddlepaddle*.whl is not a supported wheel on this platform.
-  
+
   The main cause of this issue is that your current platform is
   not supported. Please check that you are using Python 2.7 series.
   Besides, pypi only supports manylinux1 standard, you'll need to
diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md
index d8bf093e09..add06e42f1 100644
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -51,6 +51,8 @@ Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 G
 
 Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
 
+注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的，Paddle 开发人员使用的是`pip install pre-commit`。
+
 ## 开始开发
 
 在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md
index 1968c1099a..3acdbae28e 100644
--- a/doc/v2/howto/capi/workflow_of_capi_cn.md
+++ b/doc/v2/howto/capi/workflow_of_capi_cn.md
@@ -59,7 +59,7 @@
     代码示例如下：
 
     ```python
-    from paddle.utils.merge_model import merge_v2_modelss
+    from paddle.utils.merge_model import merge_v2_model
     from mnist_v2 import network
 
     net = network(is_infer=True)
diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt
index 411dc50332..4500b1f288 100644
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
@@ -13,4 +13,3 @@
 # limitations under the License.
 #
 cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
-add_style_check_target(test_cclient test_cclient.c)
diff --git a/paddle/.gitignore b/paddle/.gitignore
index 1c1c0c2c82..01904aa6ef 100644
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@@ -11,7 +11,6 @@ GTAGS
 *.pb.cc
 *.pb.h
 *_pb2.py
-paddle_*
 output/
 google/
 Makefile
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index a3d6f0f080..0d9ad30de9 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -94,7 +94,7 @@ void UpdateCallback::apply(Parameter* p) {
 }
 
 class UpdateCallbackWrapper {
-public:
+ public:
   explicit UpdateCallbackWrapper(const UpdateCallback& callback)
       : callback(const_cast<UpdateCallback&>(callback)) {}
 
@@ -105,7 +105,7 @@ public:
     delete p;
   }
 
-private:
+ private:
   UpdateCallback& callback;
 };
 
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 67368d1a99..7866122006 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -59,9 +59,10 @@ class RangeError {};
 
 /// Not support Error, such as access GPU memory directly, etc.
 class UnsupportError : public std::runtime_error {
-public:
-  UnsupportError() : std::runtime_error(" "){};
-  UnsupportError(const std::string& message) : std::runtime_error(message){};
+ public:
+  UnsupportError() : std::runtime_error(" ") {}
+  explicit UnsupportError(const std::string& message)
+      : std::runtime_error(message) {}
 };
 
 /// This type will map to python's list of float.
@@ -105,7 +106,7 @@ class Matrix {
   DISABLE_COPY(Matrix);
   static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
 
-public:
+ public:
   virtual ~Matrix();
 
   /**
@@ -231,7 +232,7 @@ public:
 
   bool isGpu() const;
 
-private:
+ private:
   void* getSharedPtr() const;
 
   MatrixPrivate* m;
@@ -248,7 +249,7 @@ class Vector {
 
   void* getSharedPtr();
 
-public:
+ public:
   ~Vector();
 
   /// Create Vector filled with zero.
@@ -310,10 +311,10 @@ public:
   /// __len__ in python
   size_t getSize() const;
 
-private:
+ private:
   VectorPrivate* m;
 
-private:
+ private:
   friend class Parameter;
   friend class ParameterOptimizer;
   friend struct ParameterTraverseCallbackPrivate;
@@ -325,7 +326,7 @@ class IVector {
   DISABLE_COPY(IVector);
   static IVector* createByPaddleVectorPtr(void* ptr);
 
-public:
+ public:
   /// Create IVector filled with zero
   static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
 
@@ -389,7 +390,7 @@ public:
   /// This method will map to python __len__();
   size_t getSize() const;
 
-private:
+ private:
   void* getSharedPtr() const;
 
   friend class Arguments;
@@ -400,11 +401,11 @@ struct ArgumentsPrivate;
 
 /// The Arguments is actual a std::vector<paddle::Argument> in paddle.
 class Arguments {
-private:
+ private:
   Arguments();  // Internal Create.
   DISABLE_COPY(Arguments);
 
-public:
+ public:
   /**
    * Create a arguments with size.
    * Note that it can be zero.
@@ -475,12 +476,12 @@ public:
 
   float sum() const;
 
-private:
+ private:
   static Arguments* createByPaddleArgumentVector(void* ptr);
   static Arguments* createByPaddleArgument(const void* ptr);
   void* getInternalArgumentsPtr() const;
 
-private:
+ private:
   ArgumentsPrivate* m;
   friend class Trainer;
   friend class GradientMachine;
@@ -507,7 +508,7 @@ class ParameterConfig {
   static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr);
   void* getRawPtr();
 
-public:
+ public:
   ~ParameterConfig();
 
   /**
@@ -515,10 +516,10 @@ public:
    */
   std::string toProtoString() const;
 
-private:
+ private:
   ParameterConfigPrivate* m;
 
-private:
+ private:
   friend class Parameter;
   friend class ParameterOptimizer;
   friend struct ParameterTraverseCallbackPrivate;
@@ -529,7 +530,7 @@ class OptimizationConfig {
   DISABLE_COPY(OptimizationConfig);
   OptimizationConfig();
 
-public:
+ public:
   static OptimizationConfig* createFromProtoString(const std::string& str);
   ~OptimizationConfig();
 
@@ -538,7 +539,7 @@ public:
    */
   std::string toProtoString();
 
-private:
+ private:
   OptimizationConfigPrivate* m;
 
   friend class TrainerConfig;
@@ -549,11 +550,11 @@ private:
 
 struct ParameterPrivate;
 class Parameter {
-private:
+ private:
   Parameter();
   DISABLE_COPY(Parameter);
 
-public:
+ public:
   virtual ~Parameter();
 
   /**
@@ -580,11 +581,11 @@ public:
 
   size_t getSize() const;
 
-private:
+ private:
   static Parameter* createFromRawPtr(void* ptr);
   static Parameter* createFromSharedPtr(void* ptr);
 
-private:
+ private:
   ParameterPrivate* m;
   friend class UpdateCallbackWrapper;
   friend class GradientMachine;
@@ -598,14 +599,14 @@ struct ModelConfigPrivate;
  * It is used by GradientMachine.
  */
 class ModelConfig {
-private:
+ private:
   ModelConfig();
   DISABLE_COPY(ModelConfig);
 
-public:
+ public:
   virtual ~ModelConfig();
 
-private:
+ private:
   ModelConfigPrivate* m;
   friend class TrainerConfig;
   friend struct TrainerConfigPrivate;
@@ -619,11 +620,11 @@ struct TrainerConfigPrivate;
  * It is used by GradientMachine.
  */
 class TrainerConfig {
-private:
+ private:
   TrainerConfig();
   DISABLE_COPY(TrainerConfig);
 
-public:
+ public:
   virtual ~TrainerConfig();
 
   static TrainerConfig* createFromTrainerConfigFile(
@@ -634,7 +635,7 @@ public:
 
   OptimizationConfig* getOptimizationConfig() const;
 
-private:
+ private:
   TrainerConfigPrivate* m;
   friend class Trainer;
 };
@@ -654,7 +655,7 @@ private:
  * @endcode
  */
 class UpdateCallback {
-public:
+ public:
   virtual ~UpdateCallback();
   virtual void apply(Parameter* p);
 };
@@ -664,14 +665,14 @@ class ParameterTraverseCallback {
   DISABLE_COPY(ParameterTraverseCallback);
   ParameterTraverseCallback();
 
-public:
+ public:
   ~ParameterTraverseCallback();
 
   void apply(const std::vector<Vector*>& vecs,
              const ParameterConfig& config,
              size_t sparseId);
 
-private:
+ private:
   ParameterTraverseCallbackPrivate* m;
   friend class ParameterOptimizer;
 };
@@ -686,7 +687,7 @@ class ParameterOptimizer {
   DISABLE_COPY(ParameterOptimizer);
   ParameterOptimizer();
 
-public:
+ public:
   static ParameterOptimizer* create(OptimizationConfig* config);
 
   ~ParameterOptimizer();
@@ -710,7 +711,7 @@ public:
   ParameterTraverseCallback* needSpecialTraversal(
       const ParameterConfig& config) const;
 
-private:
+ private:
   ParameterOptimizerPrivate* m;
 };
 
@@ -718,11 +719,11 @@ class SequenceGenerator;
 class Evaluator;
 struct GradientMachinePrivate;
 class GradientMachine {
-private:
+ private:
   GradientMachine();
   DISABLE_COPY(GradientMachine);
 
-public:
+ public:
   virtual ~GradientMachine();
 
   /**
@@ -817,7 +818,7 @@ public:
 
   void eval(Evaluator* evaluator);
 
-private:
+ private:
   GradientMachinePrivate* m;
 
   static GradientMachine* createFromPaddleModelPtr(
@@ -833,10 +834,10 @@ private:
 
 struct ParameterUpdaterPrivate;
 class ParameterUpdater {
-private:
+ private:
   ParameterUpdater();
 
-public:
+ public:
   static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
   static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
                                                int passCount,
@@ -911,17 +912,17 @@ public:
    */
   void catchUpWith();
 
-private:
+ private:
   ParameterUpdaterPrivate* m;
 };
 
 struct EvaluatorPrivate;
 class Evaluator {
-private:
+ private:
   Evaluator();
   DISABLE_COPY(Evaluator);
 
-public:
+ public:
   ~Evaluator();
 
   /**
@@ -945,7 +946,7 @@ public:
 
   double getValue(const std::string name) const;
 
-private:
+ private:
   EvaluatorPrivate* m;
 
   friend class GradientMachine;
@@ -953,13 +954,13 @@ private:
 
 struct TrainerPrivate;
 class Trainer {
-private:
+ private:
   TrainerPrivate* m;
   Trainer();
   Trainer(TrainerConfig* optConfig, GradientMachine* gm);
   DISABLE_COPY(Trainer);
 
-public:
+ public:
   virtual ~Trainer();
 
   /// Create A Trainer By TrainerConfig. using paddle command line.
@@ -1002,7 +1003,7 @@ public:
 
 /// the N-Best results generated from one input sequence.
 class ISequenceResults {
-public:
+ public:
   virtual ~ISequenceResults();
 
   /// Number of result.
@@ -1026,7 +1027,7 @@ class SequenceGenerator {
   DISABLE_COPY(SequenceGenerator);
   SequenceGenerator();
 
-public:
+ public:
   virtual ~SequenceGenerator();
 
   /**
@@ -1044,10 +1045,10 @@ public:
   void setMaxLength(size_t maxlength);
   void setBeamSize(size_t beamSize);
 
-private:
+ private:
   static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
   friend class GradientMachine;
 
-private:
+ private:
   SequenceGeneratorPrivate* m;
 };
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 1b30aec8f6..1446c30842 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -138,7 +138,7 @@ struct SequenceGeneratorPrivate {
         maxLength(0UL),
         feedback(__create_feedback__()) {}
 
-private:
+ private:
   static paddle::Argument __create_feedback__() {
     paddle::Argument feedback;
     feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
@@ -157,7 +157,7 @@ SequenceGenerator::~SequenceGenerator() { delete m; }
 
 class PathSequenceResults : public ISequenceResults {
   // ISequenceResults interface
-public:
+ public:
   PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
                       const std::shared_ptr<std::vector<std::string>>& dict)
       : path_(path), dict_(dict) {}
@@ -196,7 +196,7 @@ public:
     }
   }
 
-private:
+ private:
   std::shared_ptr<std::vector<Path>> path_;
   std::shared_ptr<std::vector<std::string>> dict_;
 };
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index e06e9a2b36..957b1a3e6b 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -33,9 +33,6 @@ add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
 
 target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 
-add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
-  ${CAPI_PRIVATE_HEADER})
-
 add_dependencies(paddle_capi paddle_proto paddle_gserver)
 
 # TODO: paddle_capi_whole will be removed.
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index ea9aab00e3..8c3f504e5a 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -26,7 +26,7 @@ enum GradientMatchineCreateMode {
 namespace paddle {
 
 class MyNeuralNetwork : public NeuralNetwork {
-public:
+ public:
   MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
       : NeuralNetwork(name, network) {}
 };
diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt
new file mode 100644
index 0000000000..4b19256ef4
--- /dev/null
+++ b/paddle/contrib/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_subdirectory(inference)
diff --git a/contrib/float16/.gitignore b/paddle/contrib/float16/.gitignore
similarity index 100%
rename from contrib/float16/.gitignore
rename to paddle/contrib/float16/.gitignore
diff --git a/contrib/float16/README.md b/paddle/contrib/float16/README.md
similarity index 99%
rename from contrib/float16/README.md
rename to paddle/contrib/float16/README.md
index ded959c47c..58b4a50666 100644
--- a/contrib/float16/README.md
+++ b/paddle/contrib/float16/README.md
@@ -89,7 +89,7 @@ cd Paddle
 # to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
 nvidia-docker build -t paddle:float16 .
 # After running this, different results will be written to different log files in Paddle/contrib/float16/
-nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
+nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
 ```
 
 #### Accuracy
diff --git a/contrib/float16/float16_benchmark.md b/paddle/contrib/float16/float16_benchmark.md
similarity index 100%
rename from contrib/float16/float16_benchmark.md
rename to paddle/contrib/float16/float16_benchmark.md
diff --git a/contrib/float16/float16_inference_demo.py b/paddle/contrib/float16/float16_inference_demo.py
similarity index 100%
rename from contrib/float16/float16_inference_demo.py
rename to paddle/contrib/float16/float16_inference_demo.py
diff --git a/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py
similarity index 100%
rename from contrib/float16/float16_transpiler.py
rename to paddle/contrib/float16/float16_transpiler.py
diff --git a/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
similarity index 95%
rename from contrib/float16/run_float16_demo.sh
rename to paddle/contrib/float16/run_float16_demo.sh
index d8a34ee67b..031225a85d 100755
--- a/contrib/float16/run_float16_demo.sh
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -3,7 +3,7 @@
 BUILD_PATH=/paddle/fp16_build
 WHEEL_PATH=$BUILD_PATH/python/dist
 INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
-DEMO_PATH=/paddle/contrib/float16
+DEMO_PATH=/paddle/paddle/contrib/float16
 
 # Use the single most powerful CUDA GPU on your machine
 export CUDA_VISIBLE_DEVICES=0
@@ -50,7 +50,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_vgg \
-      --data_set=imagenet \
       --dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
       --repeat=$REPEAT \
@@ -68,7 +67,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_resnet \
-      --data_set=imagenet \
       --dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
       --repeat=$REPEAT \
@@ -86,7 +84,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_vgg \
-      --data_set=cifar10 \
       --dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
       --repeat=$REPEAT \
@@ -104,7 +101,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_vgg \
-      --data_set=cifar10 \
       --dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
       --repeat=$REPEAT \
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
new file mode 100644
index 0000000000..f279020e93
--- /dev/null
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -0,0 +1,99 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif(APPLE)
+
+set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
+
+
+set(inference_deps paddle_inference_api paddle_fluid_api)
+
+# if anakin is set enable anakin api implementation
+if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
+    set(ANAKIN_FOUND ON)
+else()
+    set(ANAKIN_FOUND OFF)
+endif()
+
+function(fetch_include_recursively root_dir) 
+    if (IS_DIRECTORY ${root_dir}) 
+        include_directories(${root_dir})
+    endif()
+
+    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY ${root_dir}/${sub})
+            fetch_include_recursively(${root_dir}/${sub})
+        endif()
+    endforeach()
+endfunction()
+
+if (ANAKIN_FOUND)
+    # Anakin's code style doesn't follow google c style.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
+
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+
+    link_directories(${ANAKIN_LIBRARY})
+
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
+    list(APPEND inference_deps inference_anakin_api)
+endif()
+
+
+function(inference_api_test TARGET_NAME)
+    if (WITH_TESTING)
+        set(options "")
+        set(oneValueArgs "")
+        set(multiValueArgs ARGS)
+        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+        set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+        cc_test(${TARGET_NAME}
+                SRCS ${TARGET_NAME}.cc
+                DEPS "${inference_deps}"
+                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+        if(inference_test_ARGS)
+            set_tests_properties(${TARGET_NAME}
+                    PROPERTIES DEPENDS "${inference_test_ARGS}")
+        endif()
+    endif(WITH_TESTING)
+endfunction(inference_api_test)
+
+cc_library(paddle_inference_api
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+cc_test(test_paddle_inference_api
+        SRCS test_paddle_inference_api.cc
+        DEPS paddle_inference_api)
+
+inference_api_test(test_paddle_inference_api_impl
+                    ARGS test_word2vec test_image_classification)
+
+if (ANAKIN_FOUND)
+    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+    DEPS ${inference_deps})
+endif()
+
+if(WITH_TESTING)
+    add_subdirectory(demo)
+endif()
diff --git a/contrib/inference/README.md b/paddle/contrib/inference/README.md
similarity index 100%
rename from contrib/inference/README.md
rename to paddle/contrib/inference/README.md
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
new file mode 100644
index 0000000000..7b0fa77ad1
--- /dev/null
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+inference_api_test(simple_on_word2vec ARGS test_word2vec)
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc
new file mode 100644
index 0000000000..192a641426
--- /dev/null
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains a simple demo for how to take a model for inference.
+ */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <thread>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+namespace paddle {
+namespace demo {
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+void Main(bool use_gpu) {
+  //# 1. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    //# 2. Prepare input.
+    int64_t data[4] = {1, 2, 3, 4};
+
+    PaddleBuf buf{.data = data, .length = sizeof(data)};
+    PaddleTensor tensor{.name = "",
+                        .shape = std::vector<int>({4, 1}),
+                        .data = buf,
+                        .dtype = PaddleDType::INT64};
+
+    // For simplicity, we set all the slots with the same data.
+    std::vector<PaddleTensor> slots(4, tensor);
+
+    //# 3. Run
+    std::vector<PaddleTensor> outputs;
+    CHECK(predictor->Run(slots, &outputs));
+
+    //# 4. Get output.
+    ASSERT_EQ(outputs.size(), 1UL);
+    LOG(INFO) << "output buffer size: " << outputs.front().data.length;
+    const size_t num_elements = outputs.front().data.length / sizeof(float);
+    // The outputs' buffers are in CPU memory.
+    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+    }
+    // TODO(Superjomn): this is should be free automatically
+    free(outputs[0].data.data);
+  }
+}
+
+void MainThreads(int num_threads, bool use_gpu) {
+  // Multi-threads only support on CPU
+  // 0. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto main_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // 1. clone a predictor which shares the same parameters
+      auto predictor = main_predictor->Clone();
+      constexpr int num_batches = 3;
+      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
+        // 2. Dummy Input Data
+        int64_t data[4] = {1, 2, 3, 4};
+        PaddleBuf buf{.data = data, .length = sizeof(data)};
+        PaddleTensor tensor{.name = "",
+                            .shape = std::vector<int>({4, 1}),
+                            .data = buf,
+                            .dtype = PaddleDType::INT64};
+        std::vector<PaddleTensor> inputs(4, tensor);
+        std::vector<PaddleTensor> outputs;
+        // 3. Run
+        CHECK(predictor->Run(inputs, &outputs));
+
+        // 4. Get output.
+        ASSERT_EQ(outputs.size(), 1UL);
+        LOG(INFO) << "TID: " << tid << ", "
+                  << "output buffer size: " << outputs.front().data.length;
+        const size_t num_elements = outputs.front().data.length / sizeof(float);
+        // The outputs' buffers are in CPU memory.
+        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+        }
+        free(outputs[0].data.data);
+      }
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
+#endif
+
+}  // namespace demo
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc
new file mode 100644
index 0000000000..d67e1e7667
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
new file mode 100644
index 0000000000..77e2d77b6b
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -0,0 +1,116 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the definition of a simple Inference API for Paddle.
+ *
+ * ATTENTION: It requires some C++ features, for lower version C++ or C, we
+ * might release another API.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+
+enum PaddleDType {
+  FLOAT32,
+  INT64,
+};
+
+struct PaddleBuf {
+  void* data;     // pointer to the data memory.
+  size_t length;  // number of memory bytes.
+};
+
+struct PaddleTensor {
+  std::string name;  // variable name.
+  std::vector<int> shape;
+  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
+  PaddleBuf data;  // blob of data.
+  PaddleDType dtype;
+};
+
+enum class PaddleEngineKind {
+  kNative = 0,  // Use the native Fluid facility.
+  kAnakin,      // Use Anakin for inference.
+  // TODO(Superjomn) support following engines latter.
+  // kTensorRT,           // Use TensorRT for inference.
+  // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
+  // kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
+};
+
+/*
+ * A simple Inference API for Paddle. Currently this API can be used by
+ * non-sequence scenerios.
+ */
+class PaddlePredictor {
+ public:
+  struct Config;
+  PaddlePredictor() = default;
+  PaddlePredictor(const PaddlePredictor&) = delete;
+  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
+
+  // Predict an record.
+  // The caller should be responsible for allocating and releasing the memory of
+  // `inputs`. `inputs` should be alive until Run returns. caller should be
+  // responsible for releasing the memory of `output_data`.
+  virtual bool Run(const std::vector<PaddleTensor>& inputs,
+                   std::vector<PaddleTensor>* output_data) = 0;
+
+  // Clone a predictor that share the model weights, the Cloned predictor should
+  // be thread-safe.
+  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
+
+  // Destroy the Predictor.
+  virtual ~PaddlePredictor() = default;
+
+  // The common configs for all the predictors.
+  struct Config {
+    std::string model_dir;      // path to the model directory.
+    bool enable_engine{false};  // Enable to execute (part of) the model on
+  };
+};
+
+struct NativeConfig : public PaddlePredictor::Config {
+  // GPU related fields.
+  bool use_gpu{false};
+  int device{0};
+  float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+
+  std::string prog_file;
+  std::string param_file;
+};
+
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+  int device;
+  std::string model_file;
+  int max_batch_size{-1};
+};
+
+// A factory to help create different predictors.
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictors.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
new file mode 100644
index 0000000000..ea7781f691
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -0,0 +1,114 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+
+#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+
+namespace paddle {
+
+PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+    const AnakinConfig &config) {
+  CHECK(Init(config));
+}
+
+bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+  if (!(graph_.load(config.model_file))) {
+    return false;
+  }
+  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  executor_.init(graph_);
+  return true;
+}
+
+bool PaddleInferenceAnakinPredictor::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data) {
+  for (const auto &input : inputs) {
+    if (input.dtype != PaddleDType::FLOAT32) {
+      LOG(ERROR) << "Only support float type inputs. " << input.name
+                 << "'s type is not float";
+      return false;
+    }
+    auto d_tensor_in_p = executor_.get_in(input.name);
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p,
+                   static_cast<float *>(input.data.data),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
+  }
+
+  executor_.prediction();
+
+  if (output_data->empty()) {
+    LOG(ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  for (auto &output : *output_data) {
+    auto *tensor = executor_.get_out(output.name);
+    output.shape = tensor->shape();
+    // Copy data from GPU -> CPU
+    if (cudaMemcpy(output.data.data,
+                   tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
+                   cudaMemcpyDeviceToHost) != 0) {
+      LOG(ERROR) << "copy data from GPU to CPU error";
+      return false;
+    }
+  }
+  return true;
+}
+
+anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor::get_executer() {
+  return executor_;
+}
+
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
+std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+
+  return std::move(cls);
+}
+
+// A factory to help create difference predictor.
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
+    const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
+  std::unique_ptr<PaddlePredictor> x(
+      new PaddleInferenceAnakinPredictor(config));
+  return x;
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
new file mode 100644
index 0000000000..181784cbdf
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the implementation of inference API with Anakin engine
+ * embeded, this API can only support Anakin models.
+ */
+
+#pragma once
+
+// NOTE This header file do not have namespace.
+//#include <test/framework/net/paddle_api.h>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include "framework/core/net/net.h"
+#include "saber/saber_types.h"
+
+namespace paddle {
+
+class PaddleInferenceAnakinPredictor : public PaddlePredictor {
+ public:
+  PaddleInferenceAnakinPredictor() {}
+
+  PaddleInferenceAnakinPredictor(const AnakinConfig& config);
+
+  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
+  // should be allocated first.
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data) override;
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  get_executer();
+
+  ~PaddleInferenceAnakinPredictor() override{};
+
+ private:
+  bool Init(const AnakinConfig& config);
+
+  anakin::graph::Graph<anakin::NV,
+                       anakin::saber::AK_FLOAT,
+                       anakin::Precision::FP32>
+      graph_;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+      executor_;
+  AnakinConfig config_;
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
new file mode 100644
index 0000000000..47b9c6fa28
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+AnakinConfig GetConfig() {
+  AnakinConfig config;
+  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.device = 0;
+  config.max_batch_size = 1;
+  return config;
+}
+
+TEST(inference, anakin) {
+  AnakinConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+
+  float data[1 * 3 * 224 * 224] = {1.0f};
+
+  PaddleBuf buf{.data = data, .length = sizeof(data)};
+  PaddleTensor tensor{.name = "input_0",
+                      .shape = std::vector<int>({1, 3, 224, 224}),
+                      .data = buf,
+                      .dtype = PaddleDType::FLOAT32};
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  float data_out[1000];
+
+  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
+  PaddleTensor tensor_out{.name = "prob_out",
+                          .shape = std::vector<int>({1000, 1}),
+                          .data = buf_out,
+                          .dtype = PaddleDType::FLOAT32};
+
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+
+  float* data_o = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < 1000; ++j) {
+    LOG(INFO) << "output[" << j << "]: " << data_o[j];
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
new file mode 100644
index 0000000000..bda2981a14
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -0,0 +1,285 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <algorithm>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+
+namespace paddle {
+namespace {
+
+// Timer for timer
+class Timer {
+ public:
+  double start;
+  double startu;
+  void tic() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    start = tp.tv_sec;
+    startu = tp.tv_usec;
+  }
+  double toc() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    double used_time_ms =
+        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
+    return used_time_ms;
+  }
+};
+
+template <class T>
+std::string num2str(T a) {
+  std::stringstream istr;
+  istr << a;
+  return istr.str();
+}
+}  // namespace
+
+bool NativePaddlePredictor::Init(
+    std::shared_ptr<framework::Scope> parent_scope) {
+  VLOG(3) << "Predictor::init()";
+
+  if (config_.use_gpu) {
+    place_ = paddle::platform::CUDAPlace(config_.device);
+  } else {
+    place_ = paddle::platform::CPUPlace();
+  }
+  if (parent_scope) {
+    scope_ = parent_scope;
+    sub_scope_ = &(parent_scope->NewScope());
+  } else {
+    paddle::framework::InitDevices(false);
+    scope_.reset(new paddle::framework::Scope());
+  }
+
+  executor_.reset(new paddle::framework::Executor(place_));
+
+  // Initialize the inference program
+  if (!config_.model_dir.empty()) {
+    // Parameters are saved in separate files sited in
+    // the specified `dirname`.
+    inference_program_ = paddle::inference::Load(
+        executor_.get(), scope_.get(), config_.model_dir);
+  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+    // All parameters are saved in a single file.
+    // The file names should be consistent with that used
+    // in Python API `fluid.io.save_inference_model`.
+    inference_program_ = paddle::inference::Load(
+        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+  } else {
+    LOG(ERROR) << "fail to load inference model.";
+    return false;
+  }
+  ctx_ = executor_->Prepare(*inference_program_, 0);
+  executor_->CreateVariables(
+      *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
+
+  // Get the feed_target_names and fetch_target_names
+  feed_target_names_ = inference_program_->GetFeedTargetNames();
+  fetch_target_names_ = inference_program_->GetFetchTargetNames();
+  return true;
+}
+
+NativePaddlePredictor::~NativePaddlePredictor() {
+  if (sub_scope_) {
+    PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
+    scope_->DeleteScope(sub_scope_);
+  }
+};
+
+bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
+                                std::vector<PaddleTensor> *output_data) {
+  VLOG(3) << "Predictor::predict";
+  Timer timer;
+  timer.tic();
+  // set feed variable
+  std::map<std::string, const framework::LoDTensor *> feed_targets;
+  std::vector<framework::LoDTensor> feeds;
+  if (!SetFeed(inputs, &feeds)) {
+    LOG(ERROR) << "fail to set feed";
+    return false;
+  }
+  for (size_t i = 0; i < feed_target_names_.size(); ++i) {
+    feed_targets[feed_target_names_[i]] = &feeds[i];
+  }
+  // get fetch variable
+  std::map<std::string, framework::LoDTensor *> fetch_targets;
+  std::vector<framework::LoDTensor> fetchs;
+  fetchs.resize(fetch_target_names_.size());
+  for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
+    fetch_targets[fetch_target_names_[i]] = &fetchs[i];
+  }
+  // Run the inference program
+  // if share variables, we need not create variables
+  executor_->RunPreparedContext(
+      ctx_.get(),
+      sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
+      &feed_targets,
+      &fetch_targets,
+      false /* don't create variable eatch time */);
+  if (!GetFetch(fetchs, output_data)) {
+    LOG(ERROR) << "fail to get fetchs";
+    return false;
+  }
+  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
+  VLOG(3) << "Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
+
+  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  // fix manylinux compile error.
+  return std::move(cls);
+}
+
+bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
+                                    std::vector<framework::LoDTensor> *feeds) {
+  VLOG(3) << "Predictor::set_feed";
+  if (inputs.size() != feed_target_names_.size()) {
+    LOG(ERROR) << "wrong feed input size.";
+    return false;
+  }
+  for (size_t i = 0; i < feed_target_names_.size(); ++i) {
+    framework::LoDTensor input;
+    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
+    void *input_ptr;
+    if (inputs[i].dtype == PaddleDType::INT64) {
+      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
+      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+    } else {
+      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
+      return false;
+    }
+
+    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+    std::memcpy(static_cast<void *>(input_ptr),
+                inputs[i].data.data,
+                inputs[i].data.length);
+    feeds->push_back(input);
+  }
+  return true;
+}
+
+bool NativePaddlePredictor::GetFetch(
+    const std::vector<framework::LoDTensor> &fetchs,
+    std::vector<PaddleTensor> *outputs) {
+  VLOG(3) << "Predictor::get_fetch";
+  outputs->resize(fetchs.size());
+  for (size_t i = 0; i < fetchs.size(); ++i) {
+    // TODO(panyx0718): Support fetch of other types.
+    if (fetchs[i].type() != typeid(float)) {
+      LOG(ERROR) << "only support fetching float now.";
+      return false;
+    }
+    std::vector<int> shape;
+    auto dims_i = fetchs[i].dims();
+    auto lod = fetchs[i].lod();
+    const float *output_ptr = fetchs[i].data<float>();
+    // const int64_t* output_ptr = fetchs[i].data<int64_t>();
+    auto num = fetchs[i].numel();
+    std::vector<float> data;
+    if (0 == lod.size()) {
+      std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
+      for (int j = 0; j < dims_i.size(); ++j) {
+        shape.push_back(dims_i[j]);
+      }
+    } else {
+      // for batch detection
+      // image[0] -> output[0] shape {145, 6}
+      // image[1] -> output[1] shape {176, 6}
+      // then,
+      // the batch output shape {321, 6}
+      // the lod {{0, 145, 321}}
+      // so we should append output[0] to {176, 6}
+      size_t max_dim = 0;
+      for (size_t j = 1; j < lod[0].size(); j++) {
+        max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
+      }
+      size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
+      if (max_dim > 0) {
+        data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
+      }
+      for (size_t j = 1; j < lod[0].size(); j++) {
+        size_t start = lod[0][j - 1] * common_dim;
+        size_t end = lod[0][j] * common_dim;
+        if (end > start) {
+          std::copy(output_ptr + start,
+                    output_ptr + end,
+                    data.begin() + (j - 1) * max_dim * common_dim);
+        }
+      }
+      shape.push_back(lod[0].size() - 1);
+      shape.push_back(max_dim);
+      for (int j = 1; j < dims_i.size(); ++j) {
+        shape.push_back(dims_i[j]);
+      }
+    }
+
+    outputs->at(i).shape = shape;
+    outputs->at(i).data.length = sizeof(float) * data.size();
+    outputs->at(i).data.data = malloc(outputs->at(i).data.length);
+    std::memcpy(
+        outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
+    outputs->at(i).dtype = PaddleDType::FLOAT32;
+    // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
+  }
+  return true;
+}
+
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+    const NativeConfig &config) {
+  VLOG(3) << "create NativePaddlePredictor";
+  if (config.use_gpu) {
+    // 1. GPU memeroy
+    PADDLE_ENFORCE_GT(
+        config.fraction_of_gpu_memory,
+        0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    std::vector<std::string> flags;
+    if (config.fraction_of_gpu_memory >= 0.0f ||
+        config.fraction_of_gpu_memory <= 0.95f) {
+      flags.push_back("dummpy");
+      std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                         num2str<float>(config.fraction_of_gpu_memory);
+      flags.push_back(flag);
+      VLOG(3) << "set flag: " << flag;
+      framework::InitGflags(flags);
+    }
+  }
+
+  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
+  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
+    return nullptr;
+  }
+  return std::move(predictor);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
new file mode 100644
index 0000000000..86d1db7bcc
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+
+class NativePaddlePredictor : public PaddlePredictor {
+ public:
+  explicit NativePaddlePredictor(const NativeConfig &config)
+      : config_(config) {}
+
+  // will only create sub scope if have global scope
+  bool Init(std::shared_ptr<framework::Scope> parent_scope);
+
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data) override;
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  ~NativePaddlePredictor() override;
+
+ private:
+  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
+               std::vector<framework::LoDTensor> *feeds);
+  bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
+                std::vector<PaddleTensor> *output_data);
+
+  NativeConfig config_;
+  platform::Place place_;
+  std::unique_ptr<framework::Executor> executor_;
+  std::shared_ptr<framework::Scope> scope_;
+  std::unique_ptr<framework::ExecutorPrepareContext> ctx_;
+  std::unique_ptr<framework::ProgramDesc> inference_program_;
+  std::vector<std::string> feed_target_names_;
+  std::vector<std::string> fetch_target_names_;
+  // Do not use unique_ptr, use parent scope to delete
+  framework::Scope *sub_scope_{nullptr};
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api.cc b/paddle/contrib/inference/test_paddle_inference_api.cc
new file mode 100644
index 0000000000..bc7faab6e2
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+/*
+ * Do not use this, just a demo indicating how to customize a config for a
+ * specific predictor.
+ */
+struct DemoConfig : public PaddlePredictor::Config {
+  float other_config;
+};
+
+/*
+ * Do not use this, just a demo indicating how to customize a Predictor.
+ */
+class DemoPredictor : public PaddlePredictor {
+ public:
+  explicit DemoPredictor(const DemoConfig &config) {
+    LOG(INFO) << "I get other_config " << config.other_config;
+  }
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data) override {
+    LOG(INFO) << "Run";
+    return false;
+  }
+
+  std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
+
+  ~DemoPredictor() override {}
+};
+
+template <>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
+    const DemoConfig &config) {
+  std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
+  return x;
+}
+
+TEST(paddle_inference_api, demo) {
+  DemoConfig config;
+  config.other_config = 1.7;
+  auto predictor = CreatePaddlePredictor(config);
+  std::vector<PaddleTensor> outputs;
+  predictor->Run({}, &outputs);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
new file mode 100644
index 0000000000..5d843010e0
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -0,0 +1,294 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <thread>
+
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+namespace paddle {
+
+PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
+  PaddleTensor pt;
+  pt.data.data = t->data<void>();
+
+  if (t->type() == typeid(int64_t)) {
+    pt.data.length = t->numel() * sizeof(int64_t);
+    pt.dtype = PaddleDType::INT64;
+  } else if (t->type() == typeid(float)) {
+    pt.data.length = t->numel() * sizeof(float);
+    pt.dtype = PaddleDType::FLOAT32;
+  } else {
+    LOG(FATAL) << "unsupported type.";
+  }
+  pt.shape = framework::vectorize2int(t->dims());
+  return pt;
+}
+
+NativeConfig GetConfig() {
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  LOG(INFO) << "dirname  " << config.model_dir;
+  config.fraction_of_gpu_memory = 0.15;
+#ifdef PADDLE_WITH_CUDA
+  config.use_gpu = true;
+#else
+  config.use_gpu = false;
+#endif
+  config.device = 0;
+  return config;
+}
+
+void MainWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+  config.use_gpu = use_gpu;
+
+  framework::LoDTensor first_word, second_word, third_word, fourth_word;
+  framework::LoD lod{{0, 1}};
+  int64_t dict_size = 2073;  // The size of dictionary
+
+  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
+
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&first_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&second_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&third_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
+
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+  ASSERT_EQ(outputs.size(), 1UL);
+  size_t len = outputs[0].data.length;
+  float* data = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < len / sizeof(float); ++j) {
+    ASSERT_LT(data[j], 1.0);
+    ASSERT_GT(data[j], -1.0);
+  }
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&first_word);
+  cpu_feeds.push_back(&second_word);
+  cpu_feeds.push_back(&third_word);
+  cpu_feeds.push_back(&fourth_word);
+
+  framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
+
+  float* lod_data = output1.data<float>();
+  for (int i = 0; i < output1.numel(); ++i) {
+    EXPECT_LT(lod_data[i] - data[i], 1e-3);
+    EXPECT_GT(lod_data[i] - data[i], -1e-3);
+  }
+
+  free(outputs[0].data.data);
+}
+
+void MainImageClassification(bool use_gpu) {
+  int batch_size = 2;
+  bool repeat = false;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  const bool is_combined = false;
+  std::vector<std::vector<int64_t>> feed_target_shapes =
+      GetFeedTargetShapes(config.model_dir, is_combined);
+
+  framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [0.0, 1.0].
+  feed_target_shapes[0][0] = batch_size;
+  framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+  SetupTensor<float>(
+      &input, input_dims, static_cast<float>(0), static_cast<float>(1));
+  std::vector<framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  framework::LoDTensor output1;
+  std::vector<framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  TestInference<platform::CPUPlace, false, true>(
+      config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined);
+
+  auto predictor = CreatePaddlePredictor(config);
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
+
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+  ASSERT_EQ(outputs.size(), 1UL);
+  size_t len = outputs[0].data.length;
+  float* data = static_cast<float*>(outputs[0].data.data);
+  float* lod_data = output1.data<float>();
+  for (size_t j = 0; j < len / sizeof(float); ++j) {
+    EXPECT_NEAR(lod_data[j], data[j], 1e-3);
+  }
+  free(data);
+}
+
+void MainThreadsWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+
+  // prepare inputs data and reference results
+  constexpr int num_jobs = 3;
+  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // each job has 4 words
+    jobs[i].resize(4);
+    for (size_t j = 0; j < 4; ++j) {
+      framework::LoD lod{{0, 1}};
+      int64_t dict_size = 2073;  // The size of dictionary
+      SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
+      paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
+    }
+
+    // get reference result of each job
+    std::vector<paddle::framework::LoDTensor*> ref_feeds;
+    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    for (auto& word : jobs[i]) {
+      ref_feeds.push_back(&word);
+    }
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs range
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      for (size_t j = 0; j < len / sizeof(float); ++j) {
+        ASSERT_LT(data[j], 1.0);
+        ASSERT_GT(data[j], -1.0);
+      }
+
+      // check outputs correctness
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+void MainThreadsImageClassification(bool use_gpu) {
+  constexpr int num_jobs = 4;  // each job run 1 batch
+  constexpr int batch_size = 1;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  std::vector<framework::LoDTensor> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // prepare inputs
+    std::vector<std::vector<int64_t>> feed_target_shapes =
+        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
+    feed_target_shapes[0][0] = batch_size;
+    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
+    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
+
+    // get reference result of each job
+    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
+    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs correctness
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu_threads) {
+  MainThreadsWord2Vec(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu_threads) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu_threads) {
+  MainThreadsWord2Vec(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu_threads) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index efd1b7a73e..9bbb8de78e 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -87,8 +87,3 @@ else()
 endif()
 
 add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
-
-add_style_check_target(paddle_cuda
-                       ${CUDA_SOURCES}
-                       ${CUDA_HEADERS}
-                       ${CUDA_CXX_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index 29ec248420..66a69db545 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -31,7 +31,7 @@ namespace hppl {
  */
 template <class T>
 class Active {
-public:
+ public:
   typedef T (*forward)(T);
   typedef T (*backward)(T, T);
 };
diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h
index 85a022ff5e..bc5e5da53d 100644
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -23,128 +23,128 @@ namespace unary {
 
 template <class T>
 class add_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE add_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a + p; }
 };
 
 template <class T>
 class sub_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE sub_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a - p; }
 };
 
 template <class T>
 class mul_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE mul_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a * p; }
 };
 
 template <class T>
 class div_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE div_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a / p; }
 };
 
 template <class T>
 class neg {
-public:
+ public:
   INLINE T operator()(const T a) const { return -a; }
 };
 
 template <class T>
 class exp_op {
-public:
+ public:
   INLINE T operator()(const T a) const { return std::exp(a); }
 };
 
 template <class T>
 class log_op {
-public:
+ public:
   INLINE T operator()(const T a) const { return std::log(a); }
 };
 
 template <class T>
 class sqrt_op {
-public:
+ public:
   INLINE T operator()(const T a) const { return std::sqrt(a); }
 };
 
 template <class T>
 class square {
-public:
+ public:
   INLINE T operator()(const T a) const { return a * a; }
 };
 
 template <class T>
 class reciprocal {
-public:
+ public:
   INLINE T operator()(const T a) const { return T(1) / a; }
 };
 
 template <class T>
 class abs {
-public:
+ public:
   INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
 };
 
 template <class T>
 class sign {
-public:
+ public:
   INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
 };
 
 template <class T>
 class min {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE min(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a > p ? p : a; }
 };
 
 template <class T>
 class max {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE max(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a < p ? p : a; }
 };
 
 template <class T>
 class pow_op {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE pow_op(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return std::pow(a, p); }
 };
 
 template <class T>
 class constant {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE constant(const T s) : p(s) {}
   INLINE T operator()(int i) const { return p; }
   INLINE T operator()(int i, int j) const { return p; }
@@ -152,80 +152,80 @@ public:
 
 template <class T>
 class cmp_eq {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_eq(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a == p; }
 };
 
 template <class T>
 class cmp_ne {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_ne(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a != p; }
 };
 
 template <class T>
 class cmp_le {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_le(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a <= p; }
 };
 
 template <class T>
 class cmp_lt {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_lt(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a < p; }
 };
 
 template <class T>
 class cmp_ge {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_ge(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a >= p; }
 };
 
 template <class T>
 class cmp_gt {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_gt(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a > p; }
 };
 
 template <class T>
 class and_op {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE and_op(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a && p; }
 };
 
 template <class T>
 class or_op {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE or_op(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a || p; }
 };
@@ -235,96 +235,96 @@ public:
 namespace binary {
 template <class T>
 class add {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a + b; }
 };
 
 template <class T>
 class add_scale {
-private:
+ private:
   const T p1;
   const T p2;
 
-public:
+ public:
   INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
   INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
 };
 
 template <class T>
 class sub {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a - b; }
 };
 
 template <class T>
 class mul {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a * b; }
 };
 
 template <class T>
 class div {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a / b; }
 };
 
 template <class T>
 class cmp_eq {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a == b; }
 };
 
 template <class T>
 class cmp_ne {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a != b; }
 };
 
 template <class T>
 class cmp_le {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a <= b; }
 };
 
 template <class T>
 class cmp_lt {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a < b; }
 };
 
 template <class T>
 class cmp_ge {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a >= b; }
 };
 
 template <class T>
 class cmp_gt {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a > b; }
 };
 
 template <class T>
 class and_op {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a && b; }
 };
 
 template <class T>
 class or_op {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a || b; }
 };
 
 template <class T>
 class min {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
 };
 
 template <class T>
 class max {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
 };
 
@@ -332,7 +332,7 @@ public:
 #ifndef PADDLE_TYPE_DOUBLE
 template <>
 class add<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_add_ps(a, b);
   }
@@ -340,11 +340,11 @@ public:
 
 template <>
 class add_scale<__m128> {
-private:
+ private:
   const __m128 p1;
   const __m128 p2;
 
-public:
+ public:
   INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
@@ -353,7 +353,7 @@ public:
 
 template <>
 class sub<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_sub_ps(a, b);
   }
@@ -361,7 +361,7 @@ public:
 
 template <>
 class mul<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_mul_ps(a, b);
   }
@@ -369,7 +369,7 @@ public:
 
 template <>
 class div<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_div_ps(a, b);
   }
@@ -377,7 +377,7 @@ public:
 
 template <>
 class min<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_min_ps(a, b);
   }
@@ -385,7 +385,7 @@ public:
 
 template <>
 class max<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_max_ps(a, b);
   }
@@ -393,7 +393,7 @@ public:
 #else
 template <>
 class add<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_add_pd(a, b);
   }
@@ -401,11 +401,11 @@ public:
 
 template <>
 class add_scale<__m128d> {
-private:
+ private:
   const __m128d p1;
   const __m128d p2;
 
-public:
+ public:
   INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
@@ -414,7 +414,7 @@ public:
 
 template <>
 class sub<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_sub_pd(a, b);
   }
@@ -422,7 +422,7 @@ public:
 
 template <>
 class mul<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_mul_pd(a, b);
   }
@@ -430,7 +430,7 @@ public:
 
 template <>
 class div<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_div_pd(a, b);
   }
@@ -438,7 +438,7 @@ public:
 
 template <>
 class min<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_min_pd(a, b);
   }
@@ -446,7 +446,7 @@ public:
 
 template <>
 class max<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_max_pd(a, b);
   }
@@ -458,7 +458,7 @@ public:
 #ifndef PADDLE_TYPE_DOUBLE
 template <>
 class add<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vaddq_f32(a, b);
@@ -467,11 +467,11 @@ public:
 
 template <>
 class add_scale<float32x4_t> {
-private:
+ private:
   const float32x4_t p1;
   const float32x4_t p2;
 
-public:
+ public:
   INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
       : p1(s1), p2(s2) {}
   INLINE float32x4_t operator()(const float32x4_t a,
@@ -482,7 +482,7 @@ public:
 
 template <>
 class sub<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vsubq_f32(a, b);
@@ -491,7 +491,7 @@ public:
 
 template <>
 class mul<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vmulq_f32(a, b);
@@ -500,7 +500,7 @@ public:
 
 template <>
 class div<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     float32x4_t tmp = vrecpeq_f32(b);
@@ -510,7 +510,7 @@ public:
 
 template <>
 class min<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vminq_f32(a, b);
@@ -519,7 +519,7 @@ public:
 
 template <>
 class max<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vmaxq_f32(a, b);
diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu
index e30fcddffd..b8c4e433a1 100644
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
@@ -30,7 +30,7 @@ bool hl_lstm_sequence_parallel(int frameSize) {
 }
 
 class frameValue {
-public:
+ public:
   real *value_;
   __device__ frameValue(real *value) : value_(value) {}
   template <int reversed, int frameSize>
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index ed1e70c646..6bc7705806 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -83,11 +83,16 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto glog lod_rank_table feed_fetch_method)
+if(WITH_DISTRIBUTE)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+else()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+endif()
 
 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index fd409ed4c0..f537e4b9e5 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -169,17 +169,13 @@ void BlockDesc::Flush() {
   }
 
   if (need_update_) {
-    auto &op_field = *this->desc_->mutable_ops();
-    this->ClearPBOps();
-    op_field.Reserve(static_cast<int>(ops_.size()));
+    this->desc_->mutable_ops()->Clear();
     for (auto &op_desc : ops_) {
-      op_field.AddAllocated(op_desc->Proto());
+      this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
     }
-    auto &var_field = *this->desc_->mutable_vars();
-    this->ClearPBVars();
-    var_field.Reserve(static_cast<int>(vars_.size()));
+    this->desc_->mutable_vars()->Clear();
     for (auto &var_desc : vars_) {
-      var_field.AddAllocated(var_desc.second->Proto());
+      this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
     }
     need_update_ = false;
   }
@@ -200,7 +196,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
     vars_[var_desc.name()].reset(new VarDesc(var_desc));
   }
   for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, prog, this));
+    ops_.emplace_back(new OpDesc(op_desc, this));
   }
 }
 
@@ -209,7 +205,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
     : prog_(prog), desc_(desc) {
   need_update_ = true;
   for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
+    ops_.emplace_back(new OpDesc(*op, this));
   }
   for (auto &it : other.vars_) {
     auto *var = new VarDesc(*it.second);
@@ -217,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
   }
 }
 
-void BlockDesc::ClearPBOps() {
-  auto ops = this->desc_->mutable_ops();
-  while (!ops->empty()) {
-    // we do not own the OpDesc, so release the ownership.
-    ops->ReleaseLast();
-  }
-}
-
-void BlockDesc::ClearPBVars() {
-  auto vars = this->desc_->mutable_vars();
-  while (!vars->empty()) {
-    // we do not own the VarDesc, so release the ownership.
-    vars->ReleaseLast();
-  }
-}
-
 void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
   PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
                  "Parent block ID has been set to %d. Cannot set to %d",
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 600601669c..ce48548418 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -41,11 +41,6 @@ class BlockDesc {
 
   BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
 
-  ~BlockDesc() {
-    this->ClearPBVars();
-    this->ClearPBOps();
-  }
-
   int32_t ID() const { return desc_->idx(); }
 
   int32_t Parent() const { return desc_->parent_idx(); }
@@ -105,7 +100,7 @@ class BlockDesc {
 
   size_t OpSize() const { return ops_.size(); }
 
-  OpDesc *Op(int idx) { return ops_.at(idx).get(); }
+  OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
 
   void Flush();
 
@@ -113,10 +108,6 @@ class BlockDesc {
 
   ProgramDesc *Program() const { return this->prog_; }
 
- private:
-  void ClearPBOps();
-  void ClearPBVars();
-
  private:
   ProgramDesc *prog_;       // not_own
   proto::BlockDesc *desc_;  // not_own
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index a876725ac0..6bcfc6cd55 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -16,31 +16,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-static const platform::DeviceContext* GetDeviceContext(
-    const platform::Place& src_place, const platform::Place& dst_place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-
-  if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    return pool.Get(src_place);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    return pool.Get(dst_place);
-  } else {
-    PADDLE_THROW(
-        "Currently, model parallelism is only supported between CPU and CUDA");
-  }
-}
-
-void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
-                     Tensor* out) {
+void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
+                     Tensor *out) {
   VLOG(3) << "DeviceTransform in, src_place " << in.place()
           << " dst_place: " << dst_place;
-  auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
 
-  TensorCopy(in, dst_place, *dev_ctx, out);
-  if (platform::is_gpu_place(in.place()) && platform::is_cpu_place(dst_place)) {
-    dev_ctx->Wait();
-  }
+  PADDLE_ENFORCE_NE(
+      in.place().which(), dst_place.which(),
+      "Currently, model parallelism is only supported between CPU and CUDA");
+
+  // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
+  // the enforced checkings have been done in GetDeviceContext, so the
+  // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
+  // slow, especially when the number of elements is little, for example,
+  // the elements of learning rate are one and it's CPU side.
+  // One solution is to use a CUDA kernel to complete the copy operation when
+  // the transforming is from CPU to GPU and the number of elements is little.
+  // But the embarrassment is that this solution this solution makes training
+  // slower.
+  TensorCopySync(in, dst_place, out);
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h
index 9c5e2cf7cc..b611bb77b4 100644
--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -27,6 +27,7 @@ enum class DataLayout {
   kNHWC = 0,
   kNCHW = 1,
   kAnyLayout = 2,
+  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
 };
 
 inline DataLayout StringToDataLayout(const std::string& str) {
@@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
     return DataLayout::kNCHW;
   } else if (s == "ANYLAYOUT") {
     return DataLayout::kAnyLayout;
+  } else if (s == "MKLDNNLAYOUT") {
+    return DataLayout::kMKLDNN;
   } else {
     PADDLE_THROW("Unknown storage order string: %s", s);
   }
@@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
       return "NCHW";
     case DataLayout::kAnyLayout:
       return "ANY_LAYOUT";
+    case DataLayout::kMKLDNN:
+      return "MKLDNNLAYOUT";
     default:
-      PADDLE_THROW("unknown DataLayou %d", data_layout);
+      PADDLE_THROW("unknown DataLayout %d", data_layout);
   }
 }
 
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 60ec60a427..5b8dfc57ba 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -16,6 +16,9 @@
 #include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
   out->set_layout(expected_kernel_type.data_layout_);
 }
 
+#ifdef PADDLE_WITH_MKLDNN
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+
+void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
+  switch (type) {
+    case mkldnn::memory::data_type::f32:
+      return platform::to_void_cast(tensor.data<float>());
+    case mkldnn::memory::data_type::s8:
+      return platform::to_void_cast(tensor.data<char>());
+    case mkldnn::memory::data_type::u8:
+      return platform::to_void_cast(tensor.data<unsigned char>());
+    case mkldnn::memory::data_type::s16:
+      return platform::to_void_cast(tensor.data<int16_t>());
+    case mkldnn::memory::data_type::s32:
+      return platform::to_void_cast(tensor.data<int32_t>());
+    default:
+      PADDLE_THROW("wrong mkldnn type provided");
+  }
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out) {
+  auto in_layout = kernel_type_for_var.data_layout_;
+  auto out_layout = expected_kernel_type.data_layout_;
+
+  PADDLE_ENFORCE(
+      in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
+      "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
+      "non-MKLDNN");
+
+#ifdef PADDLE_WITH_MKLDNN
+  PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
+                     in.format() != memory::format::any,
+                 "Input tensor should have specified memory format");
+
+  // Set default as NCHW in case not specified
+  out_layout =
+      out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
+
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
+  auto& cpu_engine = dev_ctx->GetEngine();
+
+  std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
+  std::vector<int> out_tz = in_tz;
+
+  memory::data_type in_type = ToMKLDNNDataType(in.type());
+  PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
+                 "Input tensor type is not supported: ", in.type().name());
+  memory::data_type out_type = in_type;
+
+  memory::format in_format =
+      in_tz.size() == 2 ? memory::format::nc : in.format();
+  memory::format out_format =
+      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+
+  void* in_data = GetDataFromTensor(in, in_type);
+
+  // output tensor has the same dims as input. Reorder don't change dims
+  out->Resize(in.dims());
+
+  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
+
+  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+  auto out_memory =
+      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+
+  platform::Reorder(in_memory, out_memory);
+
+  out->set_layout(out_layout);
+  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
+  out->set_format(memory::format::format_undef);
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 06b638663d..2ba84ce57f 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -22,6 +23,50 @@
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNFormat = mkldnn::memory::format;
+using MKLDNNDataType = mkldnn::memory::data_type;
+
+inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::kNHWC:
+      return MKLDNNFormat::nhwc;
+    case DataLayout::kNCHW:
+      return MKLDNNFormat::nchw;
+    default:
+      PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
+                   DataLayoutToString(layout));
+  }
+}
+
+inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
+  switch (format) {
+    case MKLDNNFormat::nhwc:
+      return DataLayout::kNHWC;
+    case MKLDNNFormat::nchw:
+      return DataLayout::kNCHW;
+    default:
+      PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
+  }
+}
+
+inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
+  static const std::map<std::type_index, MKLDNNDataType> dict{
+      {std::type_index(typeid(float)), MKLDNNDataType::f32},  // NOLINT
+      {std::type_index(typeid(char)), MKLDNNDataType::s8},    // NOLINT
+      {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
+      {std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
+      {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
+  auto iter = dict.find(type);
+  if (iter != dict.end()) return iter->second;
+  return MKLDNNDataType::data_undef;
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out);
+
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
 
 void TransDataLayout(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 9c277a27da..b8fcc92697 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type,
   Tensor in;
   in.ShareDataWith(input_tensor);
   Tensor out;
+  DataLayout lin = kernel_type_for_var.data_layout_;
+  DataLayout lout = expected_kernel_type.data_layout_;
 
   // do layout transform
-  if (NeedTransformLayout(expected_kernel_type.data_layout_,
-                          kernel_type_for_var.data_layout_)) {
-    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+  if (NeedTransformLayout(lout, lin)) {
+    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
+      PADDLE_ENFORCE(
+          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
+          "No layout transform needed between two MKLDNN OPKernels");
+
+      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
+#ifdef PADDLE_WITH_MKLDNN
+        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
+        // Just set layout/format. No real transform occur
+        out.ShareDataWith(input_tensor);
+        out.set_layout(DataLayout::kMKLDNN);
+        out.set_format(ToMKLDNNFormat(lin));
+#endif
+      } else {
+        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
+        // Do transform via MKLDNN lib
+        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
+                                  &out);
+      }
+    } else {
+      // Case3 - transfrom between Non-MKLDNN OPKernels
+      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    }
     transformed = true;
     PassTensorData(&out, &in);
   }
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index b6b93cf422..60382faffb 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -28,6 +28,9 @@ struct DataTypeMap {
 };
 
 static DataTypeMap* InitDataTypeMap();
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 static DataTypeMap& gDataTypeMap() {
   static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
   return *g_data_type_map_;
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 9de44beafb..3c73b6cc55 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -3,30 +3,36 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
+cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
+cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
 if(WITH_GPU)
-    nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda)
-    set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+    nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            dynload_cuda variable_visitor)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
 
 else()
-    set(multi_devices_graph_builder_deps)
+    cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+             variable_visitor)
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
     cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 endif()
 
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle)
+
+
+cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -36,5 +42,6 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
-cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context reduce_op_handle )
+cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
+#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+#        device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
similarity index 61%
rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
rename to paddle/fluid/framework/details/all_reduce_op_handle.cc
index 95aa599cd3..b335d3a0d3 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -11,46 +11,65 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
-#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #include <algorithm>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
-NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
-    const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const platform::NCCLContextMap &ctxs)
+
+#ifdef PADDLE_WITH_CUDA
+AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places,
+                                     const platform::NCCLContextMap *ctxs)
     : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
-  for (auto &p : places_) {
-    this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p);
+  if (nccl_ctxs_) {
+    for (auto &p : places_) {
+      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+    }
   }
 }
+#else
+AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+#endif
 
-void NCCLAllReduceOpHandle::RunImpl() {
-  if (inputs_.size() == 1) {
+void AllReduceOpHandle::RunImpl() {
+  if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
     // Wait input done
     WaitInputVarGenerated();
-
-    auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
-    int dtype = -1;
-    size_t numel = 0;
+    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), places_.size(),
+        "The NoDummyInputSize should be equal to the number of places.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), out_var_handles.size(),
+        "The NoDummyInputSize and NoDummyOutputSize should be equal.");
 
     std::vector<const LoDTensor *> lod_tensors;
-
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto *s = local_scopes_[i];
       auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
-
-      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
+      auto &lod_tensor =
+          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
       lod_tensors.emplace_back(&lod_tensor);
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                        "The name of input and output should be equal.");
     }
 
     if (platform::is_gpu_place(lod_tensors[0]->place())) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+      int dtype = -1;
+      size_t numel = 0;
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < local_scopes_.size(); ++i) {
         auto &p = places_[i];
@@ -66,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
         }
 
         int dev_id = boost::get<platform::CUDAPlace>(p).device;
-        auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
         auto stream = nccl_ctx.stream();
         auto comm = nccl_ctx.comm_;
         all_reduce_calls.emplace_back([=] {
@@ -81,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() {
           call();
         }
       });
+#else
+      PADDLE_THROW("Not compiled with CUDA");
+#endif
     } else {  // Special handle CPU only Operator's gradient. Like CRF
       auto &trg = *this->local_scopes_[0]
                        ->FindVar(kLocalExecScopeName)
                        ->Get<Scope *>()
-                       ->Var()
+                       ->FindVar(out_var_handles[0]->name_)
                        ->GetMutable<framework::LoDTensor>();
 
       // Reduce All Tensor to trg in CPU
       ReduceLoDTensor func(lod_tensors, &trg);
       VisitDataType(ToDataType(lod_tensors[0]->type()), func);
 
-      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+      for (size_t i = 1; i < local_scopes_.size(); ++i) {
         auto &scope =
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope.FindVar(var_name);
+        auto *var = scope.FindVar(out_var_handles[i]->name_);
         auto *dev_ctx = dev_ctxes_[p];
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
@@ -109,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
   }
 }
 
-std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
+std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
similarity index 68%
rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
rename to paddle/fluid/framework/details/all_reduce_op_handle.h
index a0c321843e..fdd250b0d3 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -20,17 +20,23 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
+#endif
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-struct NCCLAllReduceOpHandle : public OpHandleBase {
-  NCCLAllReduceOpHandle(const std::vector<Scope *> &local_scopes,
-                        const std::vector<platform::Place> &places,
-                        const platform::NCCLContextMap &ctxs);
-
+struct AllReduceOpHandle : public OpHandleBase {
+#ifdef PADDLE_WITH_CUDA
+  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::NCCLContextMap *ctxs);
+#else
+  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places);
+#endif
   std::string Name() const override;
 
   // Delay and buffer nccl_all_reduce together can significantly increase
@@ -41,9 +47,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
   void RunImpl() override;
 
  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
-  const platform::NCCLContextMap &nccl_ctxs_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+#ifdef PADDLE_WITH_CUDA
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 629aa00cb8..8036f756b6 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase {
   void RunImpl() override;
 
  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 91bdfe6134..64e83acb4d 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -29,6 +31,8 @@ struct BuildStrategy {
 
   ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+
+  std::string debug_graphviz_path_{""};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index e8d510ec95..716d674fa2 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -20,8 +20,9 @@ namespace details {
 
 struct ExecutionStrategy {
   size_t num_threads_{0};
-  bool use_event_{true};
+  bool use_cuda_{true};
   bool allow_op_delay_{false};
+  size_t num_iteration_per_drop_scope_{100};
 };
 
 }  //  namespace details
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
new file mode 100644
index 0000000000..018c9bff71
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -0,0 +1,51 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FuseVarsOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
+
+  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+
+  auto out_var_handle = out_var_handles[0];
+  auto out_var = scope->Var(out_var_handle->name_);
+
+  auto out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
+
+  int64_t s = 0;
+  for (size_t i = 1; i < out_var_handles.size(); ++i) {
+    auto out_name = out_var_handles[i]->name_;
+    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
+    auto numel = this->inputs_numel_.at(out_name);
+    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
+    s += numel;
+  }
+  this->RunAndRecordEvent([] {});
+}
+
+std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
new file mode 100644
index 0000000000..140fb5bb49
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -0,0 +1,63 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FuseVarsOpHandle : public OpHandleBase {
+ public:
+  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+                   const std::unordered_map<std::string, int64_t> &inputs_numel,
+                   const std::type_index &var_type)
+      : local_scope_(local_scope),
+        place_(place),
+        inputs_numel_(inputs_numel),
+        type_(var_type) {
+    total_numel_ = 0;
+    for (auto in_numel : inputs_numel) {
+      PADDLE_ENFORCE_GT(in_numel.second, 0);
+      total_numel_ += in_numel.second;
+    }
+  }
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  Scope *local_scope_;
+  const platform::Place place_;
+  const std::unordered_map<std::string, int64_t> inputs_numel_;
+  const std::type_index type_;
+  int64_t total_numel_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 6b0c0a6b9f..78356cb1be 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -11,22 +11,22 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include <algorithm>
+#include <fstream>
+#include <string>
 #include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
-#include "paddle/fluid/framework/details/send_op_handle.h"
+#include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
 
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
-#endif
-
-#include <string>
-#include <vector>
-
 namespace paddle {
 namespace framework {
 namespace details {
@@ -78,9 +78,44 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
   }
 }
 
-bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
-                                            OpDesc *send_op) const {
-  if (send_op == nullptr) {
+std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
+    const ProgramDesc &program) const {
+  std::vector<std::string> send_vars;
+  // since parameters are all in block 0,
+  // it's enough to only scan send ops in block 0
+  for (auto *op : program.Block(0).AllOps()) {
+    // TODO(Yancey1989): use a graceful method to find send op,
+    // instead of the the hard code string
+    if (op->Type() == "send") {
+      auto op_vars = op->InputArgumentNames();
+      send_vars.reserve(send_vars.size() +
+                        std::distance(op_vars.begin(), op_vars.end()));
+      send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
+    }
+  }
+  return send_vars;
+}
+
+std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
+    const ProgramDesc &program) const {
+  std::vector<std::string> recv_vars;
+  for (auto *op : program.Block(0).AllOps()) {
+    // TODO(Yancey1989): use a graceful method to find recv op,
+    // instead of the hard code string
+    if (op->Type() == "recv") {
+      auto op_vars = op->OutputArgumentNames();
+      recv_vars.reserve(recv_vars.size() +
+                        std::distance(op_vars.begin(), op_vars.end()));
+      recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
+    }
+  }
+  return recv_vars;
+}
+
+bool MultiDevSSAGraphBuilder::IsDistTrainOp(
+    const OpDesc &op, const std::vector<std::string> &send_vars,
+    const std::vector<std::string> &recv_vars) const {
+  if (send_vars.size() == 0 || recv_vars.size() == 0) {
     return false;
   }
 
@@ -88,29 +123,28 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
    * Check any of opvars contains `.block` and in sendvars
    */
   auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &sendvars) -> bool {
+                    const std::vector<std::string> &rpc_vars) -> bool {
     for (auto &var : opvars) {
+      // a variable name with the suffix `.block` means it's a splited
+      // variable by (DistributeTranspiler)
+      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
       if (var.find(".block") != std::string::npos &&
-          std::find(sendvars.begin(), sendvars.end(), var) != sendvars.end()) {
+          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
         return true;
       }
     }
     return false;
   };
 
-  if (op.Type() == "split" || op.Type() == "split_byref") {
-    return checker(op.OutputArgumentNames(), send_op->InputArgumentNames());
-  } else if (op.Type() == "concat") {
-    return checker(op.InputArgumentNames(), send_op->OutputArgumentNames());
-  }
-  return false;
+  return checker(op.OutputArgumentNames(), send_vars) ||
+         checker(op.InputArgumentNames(), recv_vars);
 }
 
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
-  std::unordered_map<std::string, proto::VarType::Type> var_types;
+  std::unordered_map<std::string, VarDesc *> all_vars;
   for (auto *var : program.Block(0).AllVars()) {
-    var_types[var->Name()] = var->GetType();
+    all_vars[var->Name()] = var;
   }
 
   auto graph = new SSAGraph();
@@ -122,23 +156,43 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
       places_.size());
 
-  // Find "send" op first for split is in front of send.
-  OpDesc *send_op = GetSendOpDesc(program);
+  // find send/recv vars so that we can place the distributed training
+  // realted op in the place 0
+  auto send_vars = FindDistTrainSendVars(program);
+  auto recv_vars = FindDistTrainRecvVars(program);
 
-  size_t cur_device_id = 0;
   std::vector<std::unordered_set<std::string>> var_name_on_devices;
   std::vector<std::unordered_set<std::string>> bcast_var_name_set;
   var_name_on_devices.resize(places_.size());
   bcast_var_name_set.resize(places_.size());
 
+  size_t cur_device_id = 0;
+  std::vector<int64_t> balance_grads(places_.size(), 0);
+
+  auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
+    auto var_desc = all_vars.at(g_name);
+    PADDLE_ENFORCE_NOT_NULL(var_desc);
+    auto dim = framework::make_ddim(var_desc->GetShape());
+    int64_t numel = framework::product(dim);
+    PADDLE_ENFORCE_GE(numel, 0);
+    auto smallest =
+        std::min_element(std::begin(balance_grads), std::end(balance_grads));
+    size_t dev_id =
+        static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
+    balance_grads[dev_id] += numel;
+    return dev_id;
+  };
+
   bool is_forwarding = true;
   for (auto *op : program.Block(0).AllOps()) {
-    if (op->Type() == "send") {
-      // append send op if program is distributed trainer main program.
+    if (boost::get<int>(
+            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kRPC)) {
+      // append rpc op if program is distributed trainer main program.
       // always use the first device
-      CreateSendOp(&result, *op);
-    } else if (IsDistTrainOp(*op, send_op)) {
-      CreateComputationalOps(&result, *op, 1);
+      CreateRPCOp(&result, *op);
+    } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
+      CreateDistTrainOp(&result, *op);
     } else if (IsScaleLossOp(*op)) {
       // user can customize loss@grad if not use_default_grad_scale_
       if (strategy_.gradient_scale_ !=
@@ -159,25 +213,39 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       if (!is_forwarding && places_.size() > 1) {
         // Currently, we assume that once gradient is generated, it can be
         // broadcast, and each gradient is only broadcast once.
-        for (auto &og : op->OutputArgumentNames()) {
-          if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
-            switch (strategy_.reduce_) {
-              case BuildStrategy::ReduceStrategy::kReduce:
-                CreateReduceOp(&result, og, cur_device_id);
-                var_name_on_devices[cur_device_id].emplace(og);
-                bcast_var_name_set[cur_device_id].emplace(
-                    og.substr(0, og.size() - strlen(kGradVarSuffix)));
-                cur_device_id = (cur_device_id + 1) % places_.size();
-                break;
-              case BuildStrategy::ReduceStrategy::kAllReduce:
-                if (IsSparseGradient(var_types, og)) {
-                  CreateReduceOp(&result, og, 0);
-                  CreateBroadcastOp(&result, og, 0);
-                } else {
-                  InsertNCCLAllReduceOp(&result, og);
-                }
-                break;
+        if (static_cast<bool>(boost::get<int>(op->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward))) {
+          try {
+            auto backward_vars =
+                boost::get<std::vector<std::string>>(op->GetNullableAttr(
+                    OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+
+            PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+
+            for (size_t i = 0; i < backward_vars.size(); i += 2) {
+              auto &p_name = backward_vars[i];
+              auto &g_name = backward_vars[i + 1];
+              VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+
+              switch (strategy_.reduce_) {
+                case BuildStrategy::ReduceStrategy::kReduce:
+                  cur_device_id = get_appropriate_dev(g_name);
+                  CreateReduceOp(&result, g_name, cur_device_id);
+                  var_name_on_devices[cur_device_id].emplace(g_name);
+                  bcast_var_name_set[cur_device_id].emplace(p_name);
+                  break;
+                case BuildStrategy::ReduceStrategy::kAllReduce:
+                  if (IsSparseGradient(all_vars, g_name)) {
+                    CreateReduceOp(&result, g_name, 0);
+                    CreateBroadcastOp(&result, g_name, 0);
+                  } else {
+                    InsertAllReduceOp(&result, g_name);
+                  }
+                  break;
+              }
             }
+          } catch (boost::bad_get e) {
           }
         }
       }
@@ -202,25 +270,32 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    */
   AddOutputToLeafOps(&result);
 
-  if (VLOG_IS_ON(10)) {
-    std::ostringstream sout;
-    PrintGraphviz(*graph, sout);
-    VLOG(10) << sout.str();
-  }
-
   return std::unique_ptr<SSAGraph>(graph);
 }
 
 bool MultiDevSSAGraphBuilder::IsSparseGradient(
-    const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+    const std::unordered_map<std::string, VarDesc *> &all_vars,
     const std::string &og) const {
-  PADDLE_ENFORCE(var_types.count(og) != 0);
-  if (var_types.at(og) == proto::VarType::SELECTED_ROWS) {
+  PADDLE_ENFORCE(all_vars.count(og) != 0);
+  if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
     return true;
   }
   return false;
 }
 
+void MultiDevSSAGraphBuilder::SetCommunicationContext(
+    OpHandleBase *op_handle, const platform::Place &p) const {
+#ifdef PADDLE_WITH_CUDA
+  if (nccl_ctxs_ == nullptr) {
+    op_handle->SetDeviceContext(p,
+                                platform::DeviceContextPool::Instance().Get(p));
+  }
+#else
+  op_handle->SetDeviceContext(p,
+                              platform::DeviceContextPool::Instance().Get(p));
+#endif
+}
+
 void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
                                                 const std::string &p_name,
                                                 size_t src_dev_id) const {
@@ -235,15 +310,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
   op_handle->AddInput(in);
 
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto &vars = result->vars_.at(i).at(p_name);
     auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+    auto &vars = result->vars_.at(i).at(p_name);
     auto *out_var = new VarHandle(vars.size(), i, p_name, p);
     vars.emplace_back(out_var);
     op_handle->AddOutput(out_var);
-#ifndef ADDLE_WITH_CUDA
-    op_handle->SetDeviceContext(p,
-                                platform::DeviceContextPool::Instance().Get(p));
-#endif
   }
 }
 
@@ -255,24 +327,19 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
   CreateOpHandleIOs(result, op, dev_id);
 }
 
-OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
-    const ProgramDesc &program) const {
-  for (auto *op : program.Block(0).AllOps()) {
-    if (op->Type() == "send") {
-      return op;
-    }
-  }
-  return nullptr;
-}
-void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
-    SSAGraph *result, const std::string &og) const {
+void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
+                                                const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
   result->ops_.emplace_back(
-      new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
+      new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+#else
+  result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_));
+#endif
   auto *op_handle = result->ops_.back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
     auto &vars = result->vars_[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
@@ -282,9 +349,6 @@ void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
     vars.emplace_back(var);
     op_handle->AddOutput(var);
   }
-#else
-  PADDLE_ENFORCE("Not implemented");
-#endif
 }
 
 bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
@@ -323,7 +387,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
   for (size_t i = 0; i < places_.size(); ++i) {
 // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
-    auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
+    auto *communication_dev_ctx =
+        nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i])
+                   : platform::DeviceContextPool::Instance().Get(places_[i]);
 #else
     auto *communication_dev_ctx =
         platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
@@ -368,12 +434,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
   auto *op_handle = result->ops_.back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto &vars = result->vars_[i][og];
-#ifndef PADDLE_WITH_CUDA
     auto &p = places_[i];
-    op_handle->SetDeviceContext(p,
-                                platform::DeviceContextPool::Instance().Get(p));
-#endif
+    SetCommunicationContext(op_handle, p);
+    auto &vars = result->vars_[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
@@ -386,23 +449,57 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
   return var;
 }
 
-void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
-                                           const OpDesc &op) const {
-  auto &p = places_[0];
-  auto *s = local_scopes_[0];
-  // FIXME(wuyi): send op always copy from GPU 0
-  result->ops_.emplace_back(new SendOpHandle(op, s, p));
-  // Create inputs for output on original place and no ssa output
-  // is created for send op.
+void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
+                                        const std::string &prev_op_name) const {
+  for (auto &prev_op : result->ops_) {
+    if (prev_op->Name() == prev_op_name) {
+      auto *dep_var = new DummyVarHandle();
+      prev_op->AddOutput(dep_var);
+      result->dep_vars_.emplace(dep_var);
+      op->AddInput(dep_var);
+    }
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
+                                                const OpDesc &op) const {
+  CreateComputationalOp(result, op, 0);
+  if (op.Type() == "concat") {
+    ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
+                                          const OpDesc &op) const {
+  result->ops_.emplace_back(
+      new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0]));
+
+  if (op.Type() == "send_barrier") {
+    ConnectOp(result, result->ops_.back().get(), "send");
+  } else if (op.Type() == "recv") {
+    ConnectOp(result, result->ops_.back().get(), "send_barrier");
+  } else if (op.Type() == "fetch_barrier") {
+    ConnectOp(result, result->ops_.back().get(), "recv");
+  } else if (op.Type() == "send") {
+    // do nothing
+  } else {
+    PADDLE_THROW(
+        "rpc op should be in ["
+        "send, send_barrier. recv, fetch_barrier]");
+  }
+
+  // TODO(Yancey1989): schedule rpc op on different place may
+  // increate throughput
   CreateOpHandleIOs(result, op, 0);
 }
 
 bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
-  // FIXME(yy): Do not hard code like this
-  return op.OutputArgumentNames().size() == 1 &&
-         op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
+  return boost::get<int>(
+             op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+             (static_cast<int>(OpRole::kBackward) |
+              static_cast<int>(OpRole::kLoss)) &&
+         !loss_var_name_.empty();  // If loss_var is empty. This is test mode
 }
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 4f70852188..78581755fe 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -64,12 +64,24 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 
   bool IsScaleLossOp(const OpDesc &op) const;
 
-  void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
+  void CreateRPCOp(SSAGraph *result, const OpDesc &op) const;
+  void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const;
 
   /**
    * Is this operator as the end-point operator before/after send operator.
    */
-  bool IsDistTrainOp(const OpDesc &op, OpDesc *send_op) const;
+  bool IsDistTrainOp(const OpDesc &op,
+                     const std::vector<std::string> &send_vars,
+                     const std::vector<std::string> &recv_vars) const;
+
+  std::vector<std::string> FindDistTrainSendVars(
+      const ProgramDesc &program) const;
+
+  std::vector<std::string> FindDistTrainRecvVars(
+      const ProgramDesc &program) const;
+
+  void ConnectOp(SSAGraph *result, OpHandleBase *op,
+                 const std::string &prev_op_name) const;
 
   void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
                               size_t num_places) const;
@@ -88,23 +100,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
       const OpDesc &op) const;
 
-  void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
+  void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
 
   void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
-  /**
-   * Get send op in the global block of program.
-   * nullptr if not found.
-   */
-  OpDesc *GetSendOpDesc(const ProgramDesc &program) const;
-
   bool IsSparseGradient(
-      const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+      const std::unordered_map<std::string, VarDesc *> &all_vars,
       const std::string &og) const;
 
  private:
   BuildStrategy strategy_;
+
+  void SetCommunicationContext(OpHandleBase *op_handle,
+                               const platform::Place &p) const;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 6b064650b4..f79565fe71 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() {
 #endif
 }
 
-void OpHandleBase::Run(bool use_event) {
+void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_event) {
+  if (events_.empty() && use_cuda) {
     for (auto &p : dev_ctxes_) {
       int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
       PADDLE_ENFORCE(cudaSetDevice(dev_id));
@@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) {
     }
   }
 #else
-  PADDLE_ENFORCE(!use_event);
+  PADDLE_ENFORCE(!use_cuda);
 #endif
 
   RunImpl();
@@ -104,6 +104,16 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
   }
 }
 
+size_t OpHandleBase::NoDummyInputSize() const {
+  size_t cnt = 0;
+  for (auto *in : inputs_) {
+    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
+      ++cnt;
+    }
+  }
+  return cnt;
+}
+
 bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
   return in_var && in_var->generated_op_;
 }
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 8f94206a87..fbd90a3296 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -36,7 +36,7 @@ class OpHandleBase {
 
   virtual std::string Name() const = 0;
 
-  void Run(bool use_event);
+  void Run(bool use_cuda);
 
   virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
 
@@ -80,6 +80,8 @@ class OpHandleBase {
 
   const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 
+  size_t NoDummyInputSize() const;
+
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 1c4b059cd0..eea7e712f8 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -96,10 +96,7 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
     info->proto_ = new proto::OpProto;
     info->checker_ = new OpAttrChecker();
     T maker;
-    maker.SetProto(info->proto_);
-    maker.SetChecker(info->checker_);
-    maker.Make();
-    maker.Validate();
+    maker(info->proto_, info->checker_);
     info->proto_->set_type(op_type);
     PADDLE_ENFORCE(
         info->proto_->IsInitialized(),
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index 2b95a28499..a6ffb37313 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -37,7 +37,9 @@ struct ReduceLoDTensor {
     PADDLE_ENFORCE_NE(t0.numel(), 0);
     dst_tensor_.Resize(t0.dims());
     T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+    if (dst != t0.data<T>()) {
+      std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+    }
 
     for (size_t i = 1; i < src_tensors_.size(); ++i) {
       auto &t = *src_tensors_[i];
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index c652a2f4eb..4d14334cdf 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -32,8 +32,8 @@ namespace framework {
 namespace details {
 
 struct ReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
 
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
similarity index 76%
rename from paddle/fluid/framework/details/send_op_handle.cc
rename to paddle/fluid/framework/details/rpc_op_handle.cc
index 7109659dd7..586465f99f 100644
--- a/paddle/fluid/framework/details/send_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -12,24 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/send_op_handle.h"
+#include "paddle/fluid/framework/details/rpc_op_handle.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc,
-                           const Scope *local_scope,
-                           const platform::Place &place)
+RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
+                         const Scope *local_scope, const std::string &name,
+                         const platform::Place &place)
     : op_(framework::OpRegistry::CreateOp(op_desc)),
       local_scope_(local_scope),
+      name_(name),
       place_(place) {}
 
-void SendOpHandle::RunImpl() {
+void RPCOpHandle::RunImpl() {
   // TODO(wuyi): need further analysis whether wait VarDummyHandle.
   // Wait input done
   for (auto *in : inputs_) {
     auto &p = static_cast<VarHandle *>(in)->place_;
+    // FIXME(Yancey1989): need a better solution instead of use DebugString()
     if (in->DebugString() == "dummy") {  // HACK
       continue;
     }
@@ -43,7 +45,7 @@ void SendOpHandle::RunImpl() {
   op_->Run(*tmp_scope, place_);
 }
 
-std::string SendOpHandle::Name() const { return "send"; }
+std::string RPCOpHandle::Name() const { return name_; }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h
similarity index 85%
rename from paddle/fluid/framework/details/send_op_handle.h
rename to paddle/fluid/framework/details/rpc_op_handle.h
index 2f78811fad..ae38c7fe19 100644
--- a/paddle/fluid/framework/details/send_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -27,9 +27,9 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-struct SendOpHandle : public OpHandleBase {
-  SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
-               const platform::Place& place);
+struct RPCOpHandle : public OpHandleBase {
+  RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
+              const std::string& name, const platform::Place& place);
 
   std::string Name() const override;
 
@@ -43,7 +43,8 @@ struct SendOpHandle : public OpHandleBase {
  private:
   std::unique_ptr<OperatorBase> op_;
   const Scope* local_scope_;
-  const platform::Place& place_;
+  const std::string name_;
+  platform::Place place_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
new file mode 100644
index 0000000000..eb4e7ec52f
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
+    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+    std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+    std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
+    : strategy_(std::move(strategy)),
+      underlying_executor_(std::move(underlying_executor)),
+      local_scopes_(std::move(local_scopes)),
+      var_infos_(std::move(var_infos)),
+      places_(std::move(places)) {}
+
+FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  if (drop_scope_counter_ == 0) {
+    // Create local scopes.
+    for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
+      auto &scope = *it;
+      Scope &local_scope = scope->NewScope();
+      *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+          &local_scope;
+
+      for (auto &info : var_infos_) {
+        if (scope->FindVar(info.name_) != nullptr) {
+          continue;
+        }
+
+        if (info.persistable_) {  // Persistable
+          InitializeVariable(scope->Var(info.name_), info.type_);
+        } else {
+          InitializeVariable(local_scope.Var(info.name_), info.type_);
+        }
+      }
+    }
+  }
+
+  auto fetch_data = underlying_executor_->Run(fetch_tensors);
+  drop_scope_counter_ += 1;
+  if (!fetch_tensors.empty() ||
+      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    drop_scope_counter_ = 0;
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+    for (auto &scope : local_scopes_) {
+      auto &local_scope =
+          *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+      scope->DeleteScope(local_scope);
+    }
+  }
+  return fetch_data;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
new file mode 100644
index 0000000000..20df7a4722
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct VariableInfo {
+  std::string name_;
+  proto::VarType::Type type_;
+  bool persistable_;
+};
+
+class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  ScopeBufferedSSAGraphExecutor(
+      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+      std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+      std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+  FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
+
+ private:
+  size_t drop_scope_counter_{0};
+
+  ExecutionStrategy strategy_;
+  std::unique_ptr<SSAGraphExecutor> underlying_executor_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<VariableInfo> var_infos_;
+  std::vector<platform::Place> places_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 6a56752755..88a21f4887 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include <utility>
 
 namespace paddle {
 namespace framework {
@@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
   op_handle->AddOutput(var);
 }
 
-template <typename Callback>
-void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
-    for (auto &pair1 : each) {
-      for (auto &pair2 : pair1.second) {
-        callback(*pair2);
-      }
-    }
-  }
-
-  for (auto &var : graph.dep_vars_) {
-    callback(*var);
-  }
-}
-
-void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
-  size_t var_id = 0;
-  std::unordered_map<const VarHandleBase *, size_t> vars;
-
-  sout << "digraph G {\n";
-
-  IterAllVar(graph, [&](const VarHandleBase &var) {
-    auto *var_ptr = &var;
-    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
-    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
-
-    size_t cur_var_id = var_id++;
-    vars[var_ptr] = cur_var_id;
-
-    if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
-           << "\\n"
-           << var_handle_ptr->place_ << "\\n"
-           << var_handle_ptr->version_ << "\"]" << std::endl;
-    } else if (dummy_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
-    }
-  });
-
-  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
-    std::string op_name = "op_" + std::to_string(op_id++);
-    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
-         << std::endl;
-    for (auto in : op->Inputs()) {
-      std::string var_name = "var_" + std::to_string(vars[in]);
-      sout << var_name << " -> " << op_name << std::endl;
-    }
-
-    for (auto out : op->Outputs()) {
-      std::string var_name = "var_" + std::to_string(vars[out]);
-      sout << op_name << " -> " << var_name << std::endl;
-    }
-  }
-
-  sout << "}\n";
-}
-
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
   for (auto &op : graph->ops_) {
     if (!op->Outputs().empty()) {
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index 64e5d93081..5fc12a44b5 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -55,8 +55,6 @@ class SSAGraphBuilder {
                              const platform::Place &place, size_t place_offset);
 
   static void AddOutputToLeafOps(SSAGraph *graph);
-
-  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
new file mode 100644
index 0000000000..b4b49d3de6
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
+#include <fstream>
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
+  std::unique_ptr<SSAGraphBuilder> res(
+#ifdef PADDLE_WITH_CUDA
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, nccl_ctxs_, strategy_)
+#else
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, strategy_)
+#endif
+          );  // NOLINT
+
+  if (!strategy_.debug_graphviz_path_.empty()) {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(strategy_.debug_graphviz_path_));
+    PADDLE_ENFORCE(fout->good());
+    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
+        new GraphvizSSAGraphPrinter());
+    res.reset(new SSAGraghBuilderWithPrinter(
+        std::move(fout), std::move(graphviz_printer), std::move(res)));
+  }
+  res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
+
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
new file mode 100644
index 0000000000..91a119de83
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/place.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+
+class SSAGraphBuilderFactory {
+ public:
+  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
+                         const std::string& loss_var_name,
+                         const std::unordered_set<std::string>& param_names,
+                         const std::vector<Scope*>& local_scopes,
+                         const BuildStrategy& strategy)
+      : places_(places),
+        loss_var_name_(loss_var_name),
+        param_names_(param_names),
+        local_scopes_(local_scopes),
+        strategy_(strategy) {
+#ifdef PADDLE_WITH_CUDA
+    nccl_ctxs_ = nullptr;
+#endif
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
+    nccl_ctxs_ = nccl_ctxs;
+  }
+#endif
+
+  std::unique_ptr<SSAGraphBuilder> Create();
+
+ private:
+  std::vector<platform::Place> places_;
+  std::string loss_var_name_;
+  std::unordered_set<std::string> param_names_;
+  std::vector<Scope*> local_scopes_;
+  BuildStrategy strategy_;
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap* nccl_ctxs_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc
new file mode 100644
index 0000000000..da5428946e
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops;
+  std::unordered_set<VarHandleBase *> pending_vars;
+  std::unordered_set<VarHandleBase *> ready_vars;
+  std::unordered_set<OpHandleBase *> ready_ops;
+
+  auto insert_pending_var = [&](VarHandleBase *var) {
+    pending_vars.insert(var);
+    if (var->generated_op_ == nullptr) {
+      ready_vars.emplace(var);
+    }
+  };
+
+  for (auto &var_map : graph->vars_) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        insert_pending_var(version_pair.get());
+      }
+    }
+  }
+
+  for (auto &var : graph->dep_vars_) {
+    insert_pending_var(var.get());
+  }
+
+  for (auto &op : graph->ops_) {
+    if (op->Inputs().empty()) {
+      ready_ops.insert(op.get());
+    } else {
+      pending_ops.insert({op.get(), op.get()->NoDupInputSize()});
+    }
+  }
+
+  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+    for (auto *op : set) {
+      for (auto out : op->Outputs()) {
+        ready_vars.emplace(out);
+      }
+    }
+    set.clear();
+  };
+
+  while (!pending_vars.empty()) {
+    run_all_ops(ready_ops);
+
+    if (ready_vars.empty()) {
+      return false;
+    }
+
+    for (auto ready_var : ready_vars) {
+      pending_vars.erase(ready_var);
+      for (auto *op : ready_var->pending_ops_) {
+        auto &deps = --pending_ops[op];
+        if (deps == 0) {
+          ready_ops.insert(op);
+        }
+      }
+    }
+    ready_vars.clear();
+  }
+  return true;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
new file mode 100644
index 0000000000..304b221e7e
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+
+class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
+ public:
+  explicit SSAGraghBuilderWithChecker(
+      std::unique_ptr<SSAGraphBuilder>&& builder)
+      : builder_(std::move(builder)) {}
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    PADDLE_ENFORCE(IsValidGraph(graph.get()));
+    return graph;
+  }
+
+  bool IsValidGraph(const SSAGraph* graph) const;
+
+ private:
+  std::unique_ptr<SSAGraphBuilder> builder_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 8da6ca889b..09b97bd0d9 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -17,10 +17,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-
-SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph)
-    : graph_(std::move(graph)) {}
-
 SSAGraphExecutor::~SSAGraphExecutor() {}
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index a8833b7388..9580860336 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -28,15 +28,11 @@ class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
 
  public:
-  // Steal graph inside
-  explicit SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph);
+  SSAGraphExecutor() {}
 
   virtual ~SSAGraphExecutor();
 
   virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
-
- protected:
-  std::unique_ptr<SSAGraph> graph_;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
new file mode 100644
index 0000000000..22a40ca4b2
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename Callback>
+static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &each : graph.vars_) {
+    for (auto &pair1 : each) {
+      for (auto &pair2 : pair1.second) {
+        callback(*pair2);
+      }
+    }
+  }
+
+  for (auto &var : graph.dep_vars_) {
+    callback(*var);
+  }
+}
+
+void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+                                    std::ostream &sout) const {
+  size_t var_id = 0;
+  std::unordered_map<const VarHandleBase *, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    auto *var_ptr = &var;
+    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
+    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
+
+    size_t cur_var_id = var_id++;
+    vars[var_ptr] = cur_var_id;
+
+    if (var_handle_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+           << "\\n"
+           << var_handle_ptr->place_ << "\\n"
+           << var_handle_ptr->version_ << "\"]" << std::endl;
+    } else if (dummy_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+
+  size_t op_id = 0;
+  for (auto &op : graph.ops_) {
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : op->Inputs()) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : op->Outputs()) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
new file mode 100644
index 0000000000..b4c9001378
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+};
+
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+};
+
+class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
+ public:
+  SSAGraghBuilderWithPrinter(std::ostream& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ref_(sout) {}
+
+  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ptr_(std::move(sout)),
+        stream_ref_(*stream_ptr_) {}
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    printer_->Print(*graph, stream_ref_);
+    return graph;
+  }
+
+ private:
+  std::unique_ptr<SSAGraphPrinter> printer_;
+  std::unique_ptr<SSAGraphBuilder> builder_;
+  std::unique_ptr<std::ostream> stream_ptr_;
+  std::ostream& stream_ref_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 815f739371..6c5098ce85 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<SSAGraph> &&graph)
-    : SSAGraphExecutor(std::move(graph)),
+    : graph_(std::move(graph)),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
       local_scopes_(local_scopes),
@@ -185,12 +185,15 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
     ready_vars->Push(var);
   }
 }
+
 void ThreadedSSAGraphExecutor::RunOp(
     BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
-      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
-      op->Run(strategy_.use_event_);
+      if (VLOG_IS_ON(10)) {
+        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      }
+      op->Run(strategy_.use_cuda_);
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 1f7f88d752..4a2075f1cc 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -51,6 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
              details::OpHandleBase *op);
 
  private:
+  std::unique_ptr<SSAGraph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 4e431561f8..4a6f53cba1 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,13 +20,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 
 namespace paddle {
 namespace framework {
@@ -46,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+void Executor::Complete() {
+  ::paddle::operators::detail::RPCClient::GetInstance<
+      ::paddle::operators::detail::GRPCClient>()
+      ->SendComplete();
+}
+#endif
+
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   if (var_type == proto::VarType::LOD_TENSOR) {
     var->GetMutable<LoDTensor>();
@@ -78,21 +87,6 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   }
 }
 
-static void CheckTensorNANOrInf(const std::string& name,
-                                const framework::Tensor& tensor) {
-  if (tensor.memory_size() == 0) {
-    return;
-  }
-  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
-      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
-    return;
-  }
-  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
-                 "Tensor %s contains Inf", name);
-  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
-                 "Tensor %s contains NAN", name);
-}
-
 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
                                int block_id) {
   auto& global_block = pdesc.Block(block_id);
@@ -133,6 +127,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool create_local_scope, bool create_vars) {
   platform::RecordBlock b(block_id);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
   auto ctx = Prepare(pdesc, block_id);
   RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
@@ -232,16 +227,18 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
                    const std::string& feed_holder_name,
                    const std::string& fetch_holder_name) {
   platform::RecordBlock b(kProgramId);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(program);
   bool has_feed_ops =
       has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
   bool has_fetch_ops =
       has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
 
   ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+  std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
   if (!has_feed_ops || !has_fetch_ops) {
-    copy_program = std::unique_ptr<ProgramDesc>(new ProgramDesc(program)).get();
+    unique_ptr_of_copy_program.reset(new ProgramDesc(program));
+    copy_program = unique_ptr_of_copy_program.get();
   }
-
   auto* global_block = copy_program->MutableBlock(0);
 
   if (!has_feed_ops) {
@@ -340,15 +337,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
     }
-    if (FLAGS_check_nan_inf) {
-      for (auto& vname : op->OutputVars(true)) {
-        auto* var = local_scope->FindVar(vname);
-        if (var == nullptr) continue;
-        if (var->IsType<framework::LoDTensor>()) {
-          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
-        }
-      }
-    }
   }
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
   if (create_vars && create_local_scope) {
@@ -403,5 +391,19 @@ void Executor::RunPreparedContext(
   }
 }
 
+void Executor::EnableMKLDNN(const ProgramDesc& program) {
+#ifdef PADDLE_WITH_MKLDNN
+  VLOG(3) << "use_mkldnn=True";
+  for (size_t bid = 0; bid < program.Size(); ++bid) {
+    auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
+    for (auto* op : block->AllOps()) {
+      if (op->HasAttr("use_mkldnn")) {
+        op->SetAttr("use_mkldnn", true);
+      }
+    }
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 0c3c23611d..67a0761dac 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -44,6 +44,13 @@ class Executor {
 
   explicit Executor(const platform::Place& place);
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Sending signal to pserver to mark current trainer stop.
+   */
+  void Complete();
+#endif
+
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
    *
@@ -81,6 +88,8 @@ class Executor {
                           const std::string& feed_holder_name = "feed",
                           const std::string& fetch_holder_name = "fetch");
 
+  void EnableMKLDNN(const ProgramDesc& program);
+
  private:
   const platform::Place place_;
 };
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index d35125fe8c..68fcc104d4 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -71,6 +71,7 @@ message OpProto {
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
     optional bool dispensable = 5 [ default = false ];
+    optional string reuse = 6;
   }
 
   // AttrProto describes the C++ type Attribute.
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 076c457130..f92769192c 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -102,7 +103,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
   need_update_ = true;
 }
 
-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
+OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
     : desc_(desc), need_update_(false) {
   // restore inputs_
   int input_size = desc_.inputs_size();
@@ -222,6 +223,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
   return it->second;
 }
 
+Attribute OpDesc::GetNullableAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  if (it != attrs_.end()) {
+    return it->second;
+  } else {
+    return Attribute();
+  }
+}
+
 int OpDesc::GetBlockAttr(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
@@ -233,13 +243,8 @@ const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
 }
 
 void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
-  for (auto &input : inputs_) {
-    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
-  }
-  for (auto &output : outputs_) {
-    std::replace(output.second.begin(), output.second.end(), old_name,
-                 new_name);
-  }
+  RenameInput(old_name, new_name);
+  RenameOutput(old_name, new_name);
   need_update_ = true;
 }
 
@@ -249,6 +254,13 @@ void OpDesc::RenameOutput(const std::string &old_name,
     std::replace(output.second.begin(), output.second.end(), old_name,
                  new_name);
   }
+
+  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
+  if (it != attrs_.end()) {
+    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
+    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
+  }
+
   need_update_ = true;
 }
 
@@ -257,6 +269,13 @@ void OpDesc::RenameInput(const std::string &old_name,
   for (auto &input : inputs_) {
     std::replace(input.second.begin(), input.second.end(), old_name, new_name);
   }
+
+  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
+  if (it != attrs_.end()) {
+    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
+    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
+  }
+
   need_update_ = true;
 }
 
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 3ee36a47c1..a02d3e2691 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -33,13 +33,14 @@ class OpDesc {
   OpDesc(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const AttributeMap &attrs);
 
-  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+  OpDesc(const proto::OpDesc &desc, BlockDesc *block);
 
   explicit OpDesc(BlockDesc *block) : block_(block) {}
 
   OpDesc(const OpDesc &other, BlockDesc *block) {
     *this = other;
     block_ = block;
+    need_update_ = true;
   }
 
   void CopyFrom(const OpDesc &op_desc);
@@ -78,6 +79,8 @@ class OpDesc {
 
   Attribute GetAttr(const std::string &name) const;
 
+  Attribute GetNullableAttr(const std::string &name) const;
+
   int GetBlockAttr(const std::string &name) const;
 
   void Rename(const std::string &old_name, const std::string &new_name);
diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc
index b99e82f8c4..f1261dee03 100644
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
@@ -17,12 +17,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-static OpInfoMap* g_op_info_map = nullptr;
-
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 OpInfoMap& OpInfoMap::Instance() {
-  if (g_op_info_map == nullptr) {
-    g_op_info_map = new OpInfoMap();
-  }
+  static OpInfoMap* g_op_info_map = new OpInfoMap();
   return *g_op_info_map;
 }
 }  // namespace framework
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index fab20d75f5..f51a184e7b 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
 }
 
 inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
-  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+  bool ret =
+      (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
+#ifdef PADDLE_WITH_MKLDNN
+  // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
+  ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
+  ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
+#endif
+  return ret;
 }
 
 inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index c479d7617c..001b5cb5a8 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -13,6 +13,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include <string>
+#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -20,6 +21,7 @@ namespace framework {
 void OpProtoAndCheckerMaker::Validate() {
   validated_ = true;
   CheckNoDuplicatedInOutAttrs();
+  CheckReuseVars();
 }
 
 OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
@@ -55,5 +57,46 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   }
 }
 
+void OpProtoAndCheckerMaker::CheckReuseVars() {
+  std::unordered_set<std::string> names;
+  for (auto& input : proto_->inputs()) {
+    names.insert(input.name());
+  }
+  auto checker = [&](const std::string& name, const std::string& reused) {
+    PADDLE_ENFORCE(
+        names.count(reused),
+        "Output [%s] reuse Input [%s], but the input is not registered.", name,
+        reused);
+  };
+  for (auto& output : proto_->outputs()) {
+    if (output.has_reuse()) {
+      checker(output.name(), output.reuse());
+    }
+  }
+}
+
+void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
+                                        OpAttrChecker* attr_checker) {
+  proto_ = proto;
+  op_checker_ = attr_checker;
+  Make();
+
+  AddAttr<int>(OpRoleAttrName(), "The role of this operator")
+      .InEnum(
+          {static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
+           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kLoss) |
+               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kNotSpecified)})
+      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
+  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
+                                    "Optimized for variable")
+      .SetDefault({});
+
+  Validate();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index b01a520bba..92f86bb5de 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -14,27 +14,40 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+#include <unordered_set>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
 namespace paddle {
 namespace framework {
 
+enum class OpRole {
+  kForward = 0x0000,
+  kBackward = 0x0001,
+  kOptimize = 0x0002,
+  kRPC = 0x0003,
+
+  kLoss = 0x0100,
+  // The default value of op's role. This should be only used for unittests and
+  // CreateOp inside a operator.
+  kNotSpecified = 0x1000,
+};
+
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
  public:
+  static const char *OpRoleAttrName() { return "op_role"; }
+  static const char *OpRoleVarAttrName() { return "op_role_var"; }
+
+  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
+
   virtual void Make() = 0;
 
   virtual ~OpProtoAndCheckerMaker() {
     CHECK(validated_) << "should call Validate after build";
   }
 
-  void SetProto(proto::OpProto *proto) { proto_ = proto; }
-
-  void SetChecker(OpAttrChecker *attr_checker) { op_checker_ = attr_checker; }
-
-  void Validate();
-
  protected:
   struct VariableBuilder {
     proto::OpProto::Var *var_;
@@ -53,6 +66,11 @@ class OpProtoAndCheckerMaker {
       var_->set_dispensable(true);
       return *this;
     }
+
+    VariableBuilder &Reuse(const std::string &name) {
+      var_->set_reuse(name);
+      return *this;
+    }
   };
 
   VariableBuilder AddInput(const std::string &name, const std::string &comment);
@@ -76,6 +94,9 @@ class OpProtoAndCheckerMaker {
 
  private:
   void CheckNoDuplicatedInOutAttrs();
+  void Validate();
+
+  void CheckReuseVars();
 
   proto::OpProto *proto_;
   OpAttrChecker *op_checker_;
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index 9b5badbc81..58f70cb39c 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -28,10 +28,8 @@ TEST(ProtoMaker, DuplicatedAttr) {
   paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
   TestAttrProtoMaker proto_maker;
-  proto_maker.SetProto(&op_proto);
-  proto_maker.SetChecker(&op_checker);
-  proto_maker.Make();
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
 }
 
 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
@@ -46,8 +44,26 @@ TEST(ProtoMaker, DuplicatedInOut) {
   paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
   TestAttrProtoMaker proto_maker;
-  proto_maker.SetProto(&op_proto);
-  proto_maker.SetChecker(&op_checker);
-  proto_maker.Make();
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
+}
+
+class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddOutput("XOut", "output of test op").Reuse("X");
+    AddOutput("NoOut", "output of test op").Reuse("NotExists");
+  }
+};
+
+TEST(ProtoMaker, InplaceOutput) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  TestInplaceProtoMaker proto_maker;
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
+  // proto_maker(&op_proto, &op_checker);
+  // proto_maker.Make();
+  // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 748317438b..43ab227a94 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
 
   void operator()(const char* op_type, const char* library_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    std::string library(library_type);
+    std::string data_layout = "ANYLAYOUT";
+    if (library == "MKLDNN") {
+      data_layout = "MKLDNNLAYOUT";
+    }
     OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
-                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
+                     StringToDataLayout(data_layout),
+                     StringToLibraryType(library_type));
     OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
 
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
   void operator()(const char* op_type, const char* library_type) const {}
 };
 
-// User can register many kernel in one place. The data type could be different.
+// User can register many kernel in one place. The data type could be
+// different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
@@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
 /**
  * Macro to register OperatorKernel.
  */
-#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      __reg_op_kernel_##op_type##_##library_type##__,                      \
       "REGISTER_OP_KERNEL must be called in global namespace");            \
   static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
-                                                           #LIBRARY_TYPE); \
-  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
-    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
+                                                           #library_type); \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
     return 0;                                                              \
   }
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d70f26026c..c633a2f847 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -24,6 +24,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");
 
 namespace paddle {
 namespace framework {
@@ -290,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
   }
 }
 
+bool ExecutionContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto arg = ins[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
+bool ExecutionContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
+  }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto arg = outs[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   auto* var = InputVar(name);
@@ -441,10 +476,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
     auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_lod(in_tensor.lod());
 
-    // TODO(dzhwinter) : reuse ShareLoD in most operators.
-    // Need to call ShareLayout explicitly in sequence related ops.
-    // Shall we have a better method to shared info between in/out Tensor?
-    out_tensor->set_layout(in_tensor.layout());
+// TODO(dzhwinter) : reuse ShareLoD in most operators.
+// Need to call ShareLayout explicitly in sequence related ops.
+// Shall we have a better method to shared info between in/out Tensor?
+#ifdef PADDLE_WITH_MKLDNN
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //    layout of output tensor should be set "manually" in Compute()
+    //    of each OPKernel. The reason layout should NOT be shared between
+    //    input and output "automatically" (now by InferShape()->ShareLoD())
+    //    is that layout transform may occur after InferShape().
+    // Workaround:
+    //    Skip set_layout() when input layout is kMKLDNN
+    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //    in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+      out_tensor->set_layout(in_tensor.layout());
   }
 
   void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
@@ -466,6 +516,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
  protected:
   DDim GetDim(const std::string& name) const override {
     Variable* var = scope_.FindVar(name);
+    PADDLE_ENFORCE_NOT_NULL(var);
     if (var->IsType<LoDTensor>()) {
       return var->Get<LoDTensor>().dims();
     } else if (var->IsType<SelectedRows>()) {
@@ -513,6 +564,21 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const Scope& scope_;
 };
 
+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) {
+    return;
+  }
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
+      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
+    return;
+  }
+  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
+                 "Tensor %s contains Inf", name);
+  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
+                 "Tensor %s contains NAN", name);
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
@@ -597,6 +663,16 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   if (FLAGS_benchmark) {
     new_dev_ctx->Wait();
   }
+
+  if (FLAGS_check_nan_inf) {
+    for (auto& vname : OutputVars(true)) {
+      auto* var = new_scope.FindVar(vname);
+      if (var == nullptr) continue;
+      if (var->IsType<framework::LoDTensor>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      }
+    }
+  }
 }
 
 proto::VarType::Type OperatorWithKernel::IndicateDataType(
@@ -617,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
         }
         if (t != nullptr) {
           int tmp = static_cast<int>(ToDataType(t->type()));
-          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                         "DataType of Paddle Op %s must be the same.", Type());
+          PADDLE_ENFORCE(
+              tmp == data_type || data_type == -1,
+              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
+              data_type, tmp);
           data_type = tmp;
         }
       }
@@ -636,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
 OpKernelType OperatorWithKernel::GetKernelTypeForVar(
     const std::string& var_name, const Tensor& tensor,
     const OpKernelType& expected_kernel_type) const {
-  return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
+                      tensor.layout());
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 2f480e00c1..b1d75d0d0f 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -191,9 +191,9 @@ class ExecutionContext {
     return op_.Attr<T>(name);
   }
 
-  bool HasInput(const std::string& name) const { return op_.HasInputs(name); }
+  bool HasInput(const std::string& name) const;
 
-  bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); }
+  bool HasOutput(const std::string& name) const;
 
   size_t InputSize(const std::string& name) const {
     return op_.Inputs(name).size();
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 50c3468d55..ac4d1f58a5 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -22,7 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -42,9 +43,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
-
-  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
-  bool own_local_scope;
+  bool own_local_scope_;
+  bool use_cuda_;
 };
 
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -61,65 +61,78 @@ ParallelExecutor::ParallelExecutor(
     size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
+  member_->use_cuda_ = exec_strategy.use_cuda_;
 
   // Step 1. Bcast the params to devs.
   // Create local scopes
   if (local_scopes.empty()) {
-    member_->own_local_scope = true;
+    member_->own_local_scope_ = true;
     member_->local_scopes_.emplace_back(member_->global_scope_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       member_->local_scopes_.emplace_back(&scope->NewScope());
     }
   } else {
-    member_->own_local_scope = false;
+    member_->own_local_scope_ = false;
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
       member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
     }
   }
 
+  if (member_->use_cuda_) {
 // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
-  auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
-  ncclUniqueId *nccl_id = nullptr;
-  if (nccl_id_var != nullptr) {
-    nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
-  }
-  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-      member_->places_, nccl_id, num_trainers, trainer_id));
+    auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
+    ncclUniqueId *nccl_id = nullptr;
+    if (nccl_id_var != nullptr) {
+      nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
+    }
+    member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
+        member_->places_, nccl_id, num_trainers, trainer_id));
+#else
+    PADDLE_THROW("Not compiled with CUDA");
 #endif
-  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
-      local_scopes.empty()) {  // Is CUDA
+  }
+
+  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
     BCastParamsToGPUs(bcast_vars);
   }
-// Startup Program has been run. All local scopes has correct parameters.
+  // Startup Program has been run. All local scopes has correct parameters.
 
-// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
-// ncclOp
-#ifdef PADDLE_WITH_CUDA
-  details::MultiDevSSAGraphBuilder builder(
+  // Step 2. Create vars in each scope;
+  std::vector<details::VariableInfo> var_infos;
+  for (auto *var : main_program.Block(0).AllVars()) {
+    var_infos.emplace_back();
+    var_infos.back().name_ = var->Name();
+    var_infos.back().type_ = var->GetType();
+    var_infos.back().persistable_ = var->Persistable();
+  }
+
+  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+  // ncclOp
+
+  details::SSAGraphBuilderFactory builder_factory(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      member_->nccl_ctxs_.get(), build_strategy);
+      build_strategy);
+  if (member_->use_cuda_) {
+#ifdef PADDLE_WITH_CUDA
+    builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
 #else
-  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
-                                           params, member_->local_scopes_,
-                                           build_strategy);
+    PADDLE_THROW("Not compiled with CUDA");
 #endif
-  auto graph = builder.Build(main_program);
+  }
 
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places, std::move(graph)));
+      exec_strategy, member_->local_scopes_, places,
+      builder_factory.Create()->Build(main_program)));
 
-  // Step 3. Create vars in each scope;
-  for (auto *var : main_program.Block(0).AllVars()) {
-    member_->var_types_.emplace_back(var->Name(), var->GetType(),
-                                     var->Persistable());
-  }
+  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
+      exec_strategy, member_->local_scopes_, std::move(var_infos),
+      member_->places_, std::move(member_->executor_)));
 }
 
 void ParallelExecutor::BCastParamsToGPUs(
     const std::unordered_set<std::string> &vars) const {
-#ifdef PADDLE_WITH_CUDA
   auto *main_scope = member_->local_scopes_[0];
 
   for (auto &var : vars) {
@@ -131,6 +144,7 @@ void ParallelExecutor::BCastParamsToGPUs(
     auto &main_tensor = main_var->Get<LoDTensor>();
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
       platform::NCCLGroupGuard guard;
@@ -149,6 +163,10 @@ void ParallelExecutor::BCastParamsToGPUs(
         platform::dynload::ncclBcast(buffer, numel, data_type, 0,
                                      nccl_ctx.comm_, nccl_ctx.stream());
       }
+      member_->nccl_ctxs_->WaitAll();
+#else
+      PADDLE_THROW("Not compiled with CUDA");
+#endif
     } else {
       platform::CPUPlace cpu;
       for (size_t i = 1; i < member_->places_.size(); ++i) {
@@ -159,52 +177,15 @@ void ParallelExecutor::BCastParamsToGPUs(
         paddle::framework::TensorCopy(main_tensor, cpu, t);
       }
     }
-    member_->nccl_ctxs_->WaitAll();
   }
-#else
-  PADDLE_THROW("Not compiled with CUDA");
-#endif
 }
 
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
   platform::RecordBlock b(0);
-  // Create local scopes.
-  for (auto it = member_->local_scopes_.rbegin();
-       it != member_->local_scopes_.rend(); ++it) {
-    auto &scope = *it;
-    Scope &local_scope = scope->NewScope();
-    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
-        &local_scope;
-
-    for (auto &name_type_pair : member_->var_types_) {
-      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
-        continue;
-      }
-
-      if (std::get<2>(name_type_pair)) {  // Persistable
-        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
-                           std::get<1>(name_type_pair));
-      } else {
-        InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
-                           std::get<1>(name_type_pair));
-      }
-    }
-  }
-
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
-
-  // Wait All computational streams
-  for (auto p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  for (auto &scope : member_->local_scopes_) {
-    auto &local_scope =
-        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
-    scope->DeleteScope(local_scope);
-  }
 }
 
 void ParallelExecutor::FeedTensorsIntoLocalScopes(
@@ -242,7 +223,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 }
 
 ParallelExecutor::~ParallelExecutor() {
-  if (member_->own_local_scope) {
+  if (member_->own_local_scope_) {
     for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
       member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
     }
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 64fb028f83..1e01a6e900 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -51,12 +51,15 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
     auto *block = desc_.mutable_blocks(i);
     blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
   }
-  for (auto &block : blocks_) {
-    for (auto *op : block->AllOps()) {
-      for (const auto &attr : op->Proto()->attrs()) {
-        if (attr.type() == proto::AttrType::BLOCK) {
-          size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+  for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
+    auto all_ops = blocks_[block_id]->AllOps();
+    for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
+      auto &op = all_ops[op_id];
+      for (const std::string &attr_name : op->AttrNames()) {
+        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
+          int sub_block_id =
+              o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
+          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
         }
       }
     }
@@ -86,6 +89,16 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
   for (auto &block_desc : *desc_.mutable_blocks()) {
     blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 
 const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 76126f3dc6..0b36f1116d 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -25,8 +25,10 @@ void FileReader::ReadNext(std::vector<LoDTensor> *out) {
   if (out->empty()) {
     return;
   }
+
+  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
   for (size_t i = 0; i < dims_.size(); ++i) {
-    auto &actual = out->at(i).dims();
+    auto &actual = (*out)[i].dims();
     auto &expect = dims_[i];
 
     PADDLE_ENFORCE_EQ(actual.size(), expect.size());
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index 3a413941df..64d4ceab62 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -35,14 +35,15 @@ class ReaderBase {
 
 class DecoratedReader : public ReaderBase {
  public:
-  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
+      : ReaderBase(), reader_(reader) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
   }
 
   void ReInit() override { reader_->ReInit(); }
 
  protected:
-  ReaderBase* reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 
 class FileReader : public ReaderBase {
@@ -64,7 +65,7 @@ class ReaderHolder {
  public:
   void Reset(ReaderBase* reader) { reader_.reset(reader); }
 
-  ReaderBase* Get() const { return reader_.get(); }
+  std::shared_ptr<ReaderBase> Get() const { return reader_; }
 
   void ReadNext(std::vector<LoDTensor>* out) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
@@ -76,7 +77,7 @@ class ReaderHolder {
   }
 
  private:
-  std::unique_ptr<ReaderBase> reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 9091713158..bb2d866c82 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -34,13 +34,7 @@ DEFINE_bool(
 namespace paddle {
 namespace framework {
 
-Scope::~Scope() {
-  DropKids();
-  for (auto& kv : vars_) {
-    VLOG(3) << "Destroy variable " << kv.first;
-    delete kv.second;
-  }
-}
+Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
   std::unique_lock<std::mutex> lock(mutex_);
@@ -49,10 +43,13 @@ Scope& Scope::NewScope() const {
 }
 
 Variable* Scope::Var(const std::string& name) {
+  // acquire the lock when new var under this scope
+  std::unique_lock<std::mutex> lock(mutex_);
   auto* v = FindVarLocally(name);
   if (v != nullptr) return v;
+
   v = new Variable();
-  vars_[name] = v;
+  vars_[name].reset(v);
   VLOG(3) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
@@ -67,22 +64,29 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
+  // acquire the lock when find var
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindVarInternal(name);
+}
+
+Variable* Scope::FindVarInternal(const std::string& name) const {
   auto var = FindVarLocally(name);
   if (var != nullptr) {
     return var;
   }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
   for (auto& kv : vars_) {
-    if (kv.second == var) {
+    if (kv.second.get() == var) {
       return this;
     }
   }
   return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
 }
 void Scope::DropKids() {
+  std::unique_lock<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
@@ -110,10 +114,10 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+  std::unique_lock<std::mutex> lock(mutex_);
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
-      delete it->second;
       it = vars_.erase(it);
     } else {
       ++it;
@@ -129,7 +133,7 @@ void Scope::Rename(const std::string& origin_name,
   auto new_it = vars_.find(new_name);
   PADDLE_ENFORCE(new_it == vars_.end(),
                  "The variable with name %s is already in the scope", new_name);
-  vars_[new_name] = origin_it->second;
+  vars_[new_name].reset(origin_it->second.release());
   vars_.erase(origin_it);
 }
 
@@ -141,7 +145,7 @@ std::string Scope::Rename(const std::string& origin_name) const {
 
 Variable* Scope::FindVarLocally(const std::string& name) const {
   auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second;
+  if (it != vars_.end()) return it->second.get();
   return nullptr;
 }
 
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index abc82e452d..98d103d867 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -47,15 +47,18 @@ class Scope {
   Scope& NewScope() const;
 
   /// Create a variable with given name if it doesn't exist.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(const std::string& name);
 
   /// Create a variable with a scope-unique name.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(std::string* name = nullptr);
 
   void EraseVars(const std::vector<std::string>& var_names);
 
   /// Find a variable in the scope or any of its ancestors.  Returns
   /// nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
   Variable* FindVar(const std::string& name) const;
 
   const Scope* parent() const { return parent_; }
@@ -78,13 +81,21 @@ class Scope {
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
-  Variable* FindVarLocally(const std::string& name) const;
-
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
-  mutable std::unordered_map<std::string, Variable*> vars_;
+  // Called by FindVar recursively.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarInternal(const std::string& name) const;
+
+  // Called by FindVarInternal and Var.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarLocally(const std::string& name) const;
+
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+
+  // Scope in `kids_` are owned by this class.
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 56cf6693ca..06ed87e7e8 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -18,8 +18,8 @@ namespace paddle {
 namespace framework {
 
 struct ReAllocateVisitor {
-  ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims)
-      : tensor_(tensor), dims_(dims) {}
+  ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor)
+      : dims_(dims), tensor_(tensor) {}
 
   template <typename T>
   void operator()() const {
@@ -34,8 +34,8 @@ struct ReAllocateVisitor {
     tensor_->ShareDataWith(cpu_tensor);
   }
 
-  framework::Tensor* tensor_;
   framework::DDim dims_;
+  framework::Tensor* tensor_;
 };
 
 struct TensorCopyVisitor {
@@ -121,24 +121,29 @@ bool SelectedRows::HasKey(int64_t key) const {
 }
 
 std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
-    std::vector<int64_t> keys, framework::Tensor* value) const {
+    const std::vector<int64_t>& keys, framework::Tensor* value) const {
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
   std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
-  int64_t value_width = value_->numel() / value_->dims()[0];
-  PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
-                    "output tensor should have the same shape with table "
-                    "execpt the dims[0].");
-
-  for (size_t i = 0; i < keys.size(); ++i) {
-    int64_t index = Index(keys[i]);
-    if (index == -1) {
-      non_keys_pair.push_back(std::make_pair(keys[i], static_cast<int64_t>(i)));
-    } else {
-      framework::VisitDataType(
-          framework::ToDataType(value_->type()),
-          TensorCopyVisitor(value, i * value_width, *value_.get(),
-                            index * value_width, value_width));
+  if (keys.empty()) {
+    VLOG(3) << "keys is empty, please check data!";
+  } else {
+    int64_t value_width = value_->numel() / value_->dims()[0];
+    PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
+                      "output tensor should have the same shape with table "
+                      "except the dims[0].");
+
+    for (size_t i = 0; i < keys.size(); ++i) {
+      int64_t index = Index(keys[i]);
+      if (index == -1) {
+        non_keys_pair.push_back(
+            std::make_pair(keys[i], static_cast<int64_t>(i)));
+      } else {
+        framework::VisitDataType(
+            framework::ToDataType(value_->type()),
+            TensorCopyVisitor(value, i * value_width, *value_.get(),
+                              index * value_width, value_width));
+      }
     }
   }
   return non_keys_pair;
@@ -153,6 +158,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
   }
   PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
                     "The first dim of value should be 1.");
+  std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
   auto index = Index(key);
   bool is_new_key = false;
   if (index == -1) {
@@ -164,7 +170,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
       auto dims = value_->dims();
       dims[0] = (dims[0] + 1) << 1;
       framework::VisitDataType(framework::ToDataType(value.type()),
-                               ReAllocateVisitor(value_.get(), dims));
+                               ReAllocateVisitor(dims, value_.get()));
     }
   }
 
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index c27c927ee7..7160670ddd 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
+#include <memory>
+#include <mutex>  // NOLINT
 #include <utility>
 #include <vector>
 
@@ -46,11 +48,13 @@ class SelectedRows {
   SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
       : rows_(rows), height_(height) {
     value_.reset(new Tensor());
+    auto_grown_mutex_.reset(new std::mutex);
   }
 
   SelectedRows() {
     height_ = 0;
     value_.reset(new Tensor());
+    auto_grown_mutex_.reset(new std::mutex);
   }
 
   platform::Place place() const { return value_->place(); }
@@ -82,7 +86,7 @@ class SelectedRows {
    * @return a list of pair which contains the non-exists key and the index in
    * the value
    */
-  std::vector<std::pair<int64_t, int64_t>> Get(std::vector<int64_t> keys,
+  std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
                                                framework::Tensor* value) const;
 
   /*
@@ -125,6 +129,7 @@ class SelectedRows {
   Vector<int64_t> rows_;
   std::unique_ptr<Tensor> value_{nullptr};
   int64_t height_;
+  std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
 };
 
 /*
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 46c8feec00..5f497cafa0 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -63,6 +63,7 @@ class InferShapeContext {
 
   std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
   std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
+  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
 
   // Note: In while op, we need this to be public
   void SetDims(const std::vector<std::string> &names,
@@ -81,8 +82,6 @@ class InferShapeContext {
       const std::vector<std::string> &names) const;
 
   virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
-
-  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index e97ada06f0..c7286dacf0 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -15,5 +15,102 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
-namespace framework {}
+namespace framework {
+extern size_t SizeOfType(std::type_index type);
+void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
+}
+
+size_t Tensor::memory_size() const {
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+}
+
+void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
+  int64_t size = numel() * SizeOfType(type);
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
+    } else if (platform::is_gpu_place(place) ||
+               platform::is_cuda_pinned_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+      PADDLE_THROW(
+          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
+    }
+#else
+      if (platform::is_gpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+            boost::get<platform::CUDAPlace>(place), size, type));
+      } else if (platform::is_cuda_pinned_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
+            boost::get<platform::CUDAPinnedPlace>(place), size, type));
+      }
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing.");
+  return mutable_data(place, holder_->type());
+}
+
+Tensor& Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size();
+  *this = src;
+  return *this;
+}
+
+Tensor Tensor::Slice(int begin_idx, int end_idx) const {
+  check_memory_size();
+  PADDLE_ENFORCE_GE(begin_idx, 0,
+                    "The start row index must be greater than 0.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_LT(
+      begin_idx, end_idx,
+      "The start row index must be lesser than the end row index.");
+
+  if (dims_[0] == 1) {
+    return *this;
+  } else {
+    size_t base = numel() / dims_[0];
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.set_layout(layout_);
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.Resize(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
+    return dst;
+  }
+}
+
+Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}
+
+const DDim& Tensor::dims() const { return dims_; }
+
+int64_t Tensor::numel() const { return product(dims_); }
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6f878541e6..ef224d68f1 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -34,6 +34,28 @@ namespace framework {
 class LoDTensor;
 
 class Tensor {
+#ifdef PADDLE_WITH_MKLDNN
+
+ public:
+  inline mkldnn::memory::format format() const { return format_; }
+
+  inline void set_format(const mkldnn::memory::format format) {
+    format_ = format;
+  }
+
+ protected:
+  /**
+   * @brief the detail format of memory block which have layout as kMKLDNN
+   *
+   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
+   */
+
+  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
+#endif
+
  public:
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
@@ -54,26 +76,24 @@ class Tensor {
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
-  inline T* data();
+  T* data();
 
   /*! Return a pointer to constant memory block. */
   template <typename T>
-  inline const T* data() const;
+  const T* data() const;
 
-  inline bool IsInitialized() const;
-
-  inline void switch_place(platform::Place new_place);
+  bool IsInitialized() const;
 
   /**
    * @brief   Return a pointer to mutable memory block.
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place);
 
-  inline void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type);
 
-  inline void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place);
 
   /**
    * @brief     Return a pointer to mutable memory block.
@@ -84,19 +104,19 @@ class Tensor {
    * @note      If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place);
 
   /*! Return the dimensions of the memory block. */
-  inline const DDim& dims() const;
+  const DDim& dims() const;
 
   /*! Return the numel of the memory block. */
-  inline int64_t numel() const;
+  int64_t numel() const;
 
   /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
+  Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
-  inline Tensor& ShareDataWith(const Tensor& src);
+  Tensor& ShareDataWith(const Tensor& src);
 
   /**
    * @brief  Return a sub-tensor of the given tensor.
@@ -106,7 +126,7 @@ class Tensor {
    * @param[in] end_idx     The index of the end row(exclusive) to slice.
    *                        The index number begins from 0.
    */
-  inline Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int begin_idx, int end_idx) const;
 
   platform::Place place() const {
     PADDLE_ENFORCE_NOT_NULL(
@@ -123,11 +143,11 @@ class Tensor {
   // memory size returns the holding memory size in byte.
   size_t memory_size() const;
 
-  inline void check_memory_size() const;
+  void check_memory_size() const;
 
-  inline DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return layout_; }
 
-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+  void set_layout(const DataLayout layout) { layout_ = layout; }
 
  private:
   /**
@@ -197,8 +217,10 @@ class Tensor {
    *       N,C,H,W for respectively the batch size, the number of
    *       feature maps, the height.
    */
-
-  DataLayout layout_ = DataLayout::kNHWC;
+  // Fix me: here just change the default layout to kNCHW
+  // it doesn't fix the real issue, i.e. feeder should set up tensor layout
+  // according to actual input data
+  DataLayout layout_ = DataLayout::kNCHW;
 
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
@@ -210,15 +232,6 @@ class Tensor {
   size_t offset_;
 };
 
-inline void Tensor::switch_place(platform::Place new_place) {
-  if (holder_->place() == new_place) {
-    return;
-  }
-
-  // TODO(tonyyang-svail): do memcpy here.
-  PADDLE_THROW("Not Implemented");
-}
-
 }  // namespace framework
 }  // namespace paddle
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 0a1db7758b..96114678a9 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -20,26 +20,11 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-extern size_t SizeOfType(std::type_index type);
-inline void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
-}
-
-inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-}
-
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
   PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type().hash_code() == typeid(T).hash_code(),
+                     holder_->type() == std::type_index(typeid(T)),
                  "Tensor holds the wrong type, it holds %s",
                  this->holder_->type().name());
 
@@ -53,7 +38,7 @@ template <typename T>
 inline T* Tensor::data() {
   check_memory_size();
   PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type().hash_code() == typeid(T).hash_code(),
+                     holder_->type() == std::type_index(typeid(T)),
                  "Tensor holds the wrong type, it holds %s",
                  this->holder_->type().name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
@@ -73,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) {
   return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
 
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
-                    "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place) ||
-               platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW(
-          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-    }
-#else
-      if (platform::is_gpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-            boost::get<platform::CUDAPlace>(place), size, type));
-      } else if (platform::is_cuda_pinned_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
-            boost::get<platform::CUDAPinnedPlace>(place), size, type));
-      }
-    }
-#endif
-    offset_ = 0;
-  }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-}
-
-inline void* Tensor::mutable_data(platform::Place place) {
-  PADDLE_ENFORCE(this->holder_ != nullptr,
-                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type());
-}
-
-inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  *this = src;
-  return *this;
-}
-
-inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
-  PADDLE_ENFORCE_LT(
-      begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
-
-  if (dims_[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(layout_);
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
-    return dst;
-  }
-}
-
-inline Tensor& Tensor::Resize(const DDim& dims) {
-  dims_ = dims;
-  return *this;
-}
-
-inline const DDim& Tensor::dims() const { return dims_; }
-
-inline int64_t Tensor::numel() const { return product(dims_); }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index e1012de2ec..0a1cb6d570 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) {
 
 TEST(Tensor, Layout) {
   framework::Tensor src;
-  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index b98aeed8a0..ec16a1c600 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,17 +1,23 @@
 set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 
+# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
     SRCS io.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
-# Create static library
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-cc_library(paddle_fluid DEPS ${fluid_modules})
 
+if(WITH_CONTRIB)
+  set(fluid_modules "${fluid_modules}" paddle_inference_api)
+endif()
+
+# Create static library
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
-    DEPS ${fluid_modules})
+    DEPS ${fluid_modules} paddle_fluid_api)
+
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 47929ef749..5083578444 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -1,2 +1,23 @@
-cc_library(analysis SRCS dot.cc node.cc node.h)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
+cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc
+  DEPS paddle_fluid)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
+cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+
+cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid
+  ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec)
+
+cc_test(test_subgraph_splitter
+        SRCS subgraph_splitter_tester.cc
+        DEPS analysis paddle_fluid tensor
+        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec)
+
+cc_test(test_dfg_graphviz_draw_pass
+        SRCS dfg_graphviz_draw_pass_tester.cc
+        DEPS analysis
+        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_dfg_graphviz_draw_pass PROPERTIES DEPENDS test_word2vec)
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
new file mode 100644
index 0000000000..4220451e3c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -0,0 +1,205 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/dot.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// It is a better idea that the inputs and outputs of this graph is set manully
+// before, but there must be a Pass that helps to prune the unnecessary ops that
+// do not contribute to the given targets, so in this pass, analysis and get the
+// inputs and outputs is OK.
+void DataFlowGraph::Build() {
+  inputs.clear();
+  outputs.clear();
+  std::unordered_set<Node *> ins;
+  std::unordered_set<Node *> outs;
+  for (auto &node : nodes.nodes()) {
+    for (auto *in : node->inlinks) {
+      ins.insert(in);
+    }
+    for (auto *out : node->outlinks) {
+      outs.insert(out);
+    }
+  }
+
+  // The nodes that in ins but not in outs is the graph's inputs
+  // similarly, the nodes that in outs but not in ins is the graphs' outputs
+  for (auto *in : ins) {
+    if (!outs.count(in)) {
+      inputs.push_back(in);
+    }
+  }
+  for (auto *out : outs) {
+    if (!outs.count(out)) {
+      outputs.push_back(out);
+    }
+  }
+}
+
+std::string DataFlowGraph::DotString() const {
+  Dot dot;
+
+  // Add nodes
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    switch (node.type()) {
+      case Node::Type::kValue:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      case Node::Type::kFunction:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      case Node::Type::kFunctionBlock:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      default:
+        PADDLE_THROW("unsupported Node type %d", static_cast<int>(node.type()));
+    }
+  }
+
+  // Add edges
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    for (auto &in : node.inlinks) {
+      dot.AddEdge(in->repr(), node.repr(), {});
+    }
+  }
+  return dot.Build();
+}
+
+//
+// NodesBFSIterator
+//
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const std::vector<Node *> &source)
+    : queue_(source.begin(), source.end()) {}
+
+// GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
+//     : queue_(std::move(other.queue_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
+    : queue_(other.queue_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator*() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return *queue_.front();
+}
+
+Node *GraphTraits<DataFlowGraph>::NodesBFSIterator::operator->() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return queue_.front();
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator &
+GraphTraits<DataFlowGraph>::NodesBFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  queue_ = other.queue_;
+  visited_ = other.visited_;
+  return *this;
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator
+    &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator++() {
+  PADDLE_ENFORCE(!queue_.empty());
+  auto *cur = queue_.front();
+  visited_.insert(cur);
+  queue_.pop_front();
+  for (auto *output : cur->outlinks) {
+    if (!visited_.count(output)) {
+      queue_.push_back(output);
+      visited_.insert(output);
+    }
+  }
+  return *this;
+}
+
+bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  if (queue_.empty()) return other.queue_.empty();
+  if ((!queue_.empty()) && (!other.queue_.empty())) {
+    return queue_.front() == other.queue_.front() &&
+           visited_.size() == other.visited_.size();  // here need to check the
+                                                      // equality of queue and
+    // visited. Just a light but week implementation.
+  }
+  return false;
+}
+
+//
+// NodesDFSIterator
+//
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const std::vector<Node *> &source) {
+  for (auto *x : source) stack_.push(x);
+}
+
+// GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
+//     : stack_(std::move(other.stack_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
+    : stack_(other.stack_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator*() {
+  PADDLE_ENFORCE(!stack_.empty());
+  return *stack_.top();
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator
+    &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator++() {
+  if (stack_.empty()) return *this;
+  visited_.insert(stack_.top());
+  auto *cur = stack_.top();
+  stack_.pop();
+  for (auto *x : cur->outlinks) {
+    if (!visited_.count(x)) {
+      stack_.push(x);
+      visited_.insert(x);
+    }
+  }
+  return *this;
+}
+bool GraphTraits<DataFlowGraph>::NodesDFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  if (stack_.empty()) return other.stack_.empty();
+  if ((!stack_.empty()) && (!other.stack_.empty())) {
+    return stack_.top() == other.stack_.top();
+  }
+  return false;
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator &
+GraphTraits<DataFlowGraph>::NodesDFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  stack_ = other.stack_;
+  visited_ = other.visited_;
+  return *this;
+}
+Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
+  return stack_.top();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
new file mode 100644
index 0000000000..913e344d37
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -0,0 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * Data flow graph is an pass that build the basic graph. It contains a graph
+ * and the iterators that enable the iteration over the graph.
+ */
+
+#pragma once
+
+#include <deque>
+#include <stack>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
+#include "paddle/fluid/inference/analysis/node.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * DataFlowGraph - A container of Value and Function Nodes.
+ */
+struct DataFlowGraph {
+  NodeMap nodes;
+  std::vector<Node *> inputs;
+  std::vector<Node *> outputs;
+
+  // Extract inputs and outputs of the graph.
+  void Build();
+
+  // Output a DOT graph file for debug.
+  std::string DotString() const;
+};
+
+/*
+ * An graph trait help to traverse the graph using BFS.
+ * The BFS start from a graph's inputs, the graph should be fully-connected, so
+ * that the iterator can reach the end.
+ */
+template <>
+struct GraphTraits<DataFlowGraph> {
+  // BFS iterator on nodes.
+  struct NodesBFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesBFSIterator() = default;
+    explicit NodesBFSIterator(const std::vector<Node *> &source);
+    // NodesBFSIterator(NodesBFSIterator &&other) noexcept;
+    // NOTE Heavy to use.
+    NodesBFSIterator(const NodesBFSIterator &other);
+
+    Node &operator*();
+    NodesBFSIterator &operator++();
+    Node *operator->();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesBFSIterator &operator=(const NodesBFSIterator &other);
+    bool operator==(const NodesBFSIterator &other);
+    bool operator!=(const NodesBFSIterator &other) { return !(*this == other); }
+
+   private:
+    std::deque<Node *> queue_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  // DFS iterator on nodes.
+  struct NodesDFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesDFSIterator() = default;
+    explicit NodesDFSIterator(const std::vector<Node *> &source);
+    // NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+    NodesDFSIterator(const NodesDFSIterator &other);
+
+    Node &operator*();
+    NodesDFSIterator &operator++();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesDFSIterator &operator=(const NodesDFSIterator &other);
+    bool operator==(const NodesDFSIterator &other);
+    bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
+    Node *operator->();
+
+   private:
+    std::stack<Node *> stack_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
+
+  // default use BFS to visit the nodes.
+  iterator_range<NodesBFSIterator> nodes() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesBFSIterator> nodes_in_BFS() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesDFSIterator> nodes_in_DFS() {
+    return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
+  }
+
+ private:
+  NodesBFSIterator nodes_bfs_begin() {
+    return NodesBFSIterator(graph_->inputs);
+  }
+  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+  NodesDFSIterator nodes_dfs_begin() {
+    return NodesDFSIterator(graph_->inputs);
+  }
+  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
+
+ private:
+  DataFlowGraph *graph_;
+};
+
+// Extract the inputs and outputs of a graph. The inputs and outputs of a
+// sub-graph is the inputs nodes and output nodes that doesn't inside the
+// sub-graph.
+std::pair<
+    std::vector<Node *>,
+    std::vector<
+        Node *>> static ExtractInputAndOutputOfSubGraph(std::vector<Node *>
+                                                            &graph) {
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      if (!nodes.count(in) && in->type() == Node::Type::kValue) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
new file mode 100644
index 0000000000..9d7cceeb65
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(DataFlowGraph, BFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+
+  for (auto* in : dfg.inputs) {
+    LOG(INFO) << "inputs: " << in->name() << " "
+              << static_cast<int>(in->type());
+  }
+  for (auto* out : dfg.outputs) {
+    LOG(INFO) << "outputs: " << out->name() << " "
+              << static_cast<int>(out->type());
+  }
+
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes();
+  size_t count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+TEST(DataFlowGraph, DFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes_in_DFS();
+  size_t count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
new file mode 100644
index 0000000000..dcee75cee5
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -0,0 +1,49 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+
+#include <glog/logging.h>
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/io.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Test) {
+  framework::proto::ProgramDesc new_desc;
+  DataFlowGraph graph;
+
+  FluidToDataFlowGraphPass pass0;
+  DataFlowGraphToFluidPass pass1;
+  pass0.Initialize(desc);
+  pass1.Initialize(&new_desc);
+
+  pass0.Run(&graph);
+  pass1.Run(&graph);
+
+  pass0.Finalize();
+  pass1.Finalize();
+
+  LOG(INFO) << graph.nodes.size();
+}
+
+};  // namespace analysis
+};  // namespace inference
+};  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
new file mode 100644
index 0000000000..41d4475382
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file create an DFG_GraphvizDrawPass which helps to draw a data flow
+ * graph's structure using graphviz.
+ */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Output a dot file and write to some place.
+ */
+class DFG_GraphvizDrawPass : public DataFlowGraphPass {
+ public:
+  DFG_GraphvizDrawPass(const std::string& dir, const std::string& id)
+      : dir_(dir), id_(id) {}
+
+  bool Initialize() override { return Pass::Initialize(); }
+  void Run(DataFlowGraph* graph) override {
+    auto content = Draw(graph);
+    std::ofstream file(GenDotPath());
+    file.write(content.c_str(), content.size());
+    file.close();
+    LOG(INFO) << "draw dot to " << GenDotPath();
+  }
+
+  bool Finalize() override { return Pass::Finalize(); }
+
+  Pass* CreatePrinterPass(std::ostream& os,
+                          const std::string& banner) const override {
+    return nullptr;
+  }
+
+ private:
+  // Path of the dot file to output.
+  std::string GenDotPath() const {
+    return dir_ + "/" + "graph_" + id_ + ".dot";
+  }
+
+  std::string Draw(DataFlowGraph* graph) { return graph->DotString(); }
+
+  std::string dir_;
+  std::string id_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
new file mode 100644
index 0000000000..3fc1cc18b8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
+  auto dfg = ProgramDescToDFG(desc);
+  DFG_GraphvizDrawPass pass("./", "test");
+  pass.Initialize();
+  pass.Run(&dfg);
+
+  // test content
+  std::ifstream file("./graph_test.dot");
+  ASSERT_TRUE(file.is_open());
+
+  std::string line;
+  int no{0};
+  while (std::getline(file, line)) {
+    no++;
+  }
+  ASSERT_EQ(no, 82);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
new file mode 100644
index 0000000000..9f67c989cc
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -0,0 +1,85 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {}
+
+bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); }
+
+bool FluidToDataFlowGraphPass::Initialize(
+    const framework::proto::ProgramDesc &desc) {
+  desc_ = &desc;
+  return true;
+}
+
+bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }
+
+void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
+  // insert vars
+  std::unordered_map<std::string, size_t> var2id;
+  auto &main_block = desc_->blocks(framework::kRootBlockIndex);
+  for (int i = 0; i < main_block.vars_size(); i++) {
+    const auto &var = main_block.vars(i);
+    auto *v = graph->nodes.Create(Node::Type::kValue);
+    v->SetName(var.name());
+    v->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&var)));
+    var2id[var.name()] = v->id();
+  }
+  for (int i = 0; i < main_block.ops_size(); i++) {
+    const auto &op = main_block.ops(i);
+    auto *o = graph->nodes.Create(Node::Type::kFunction);
+    o->SetName(op.type());
+    static_cast<Function *>(o)->SetFuncType(op.type());
+    // Link to the original protobuf message's memory, make it easier to
+    // generate from a data flow graph to fluid ProgramDesc.
+    o->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&op)));
+    // set inputs and outputs
+    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    for (int j = 0; j < op.inputs_size(); j++) {
+      auto &in_var = op.inputs(j);
+      for (int k = 0; k < in_var.arguments_size(); k++) {
+        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
+        in->outlinks.push_back(o);
+        o->inlinks.push_back(in);
+      }
+    }
+    for (int j = 0; j < op.outputs_size(); j++) {
+      auto &out_var = op.outputs(j);
+      for (int k = 0; k < out_var.arguments_size(); k++) {
+        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        out->inlinks.push_back(o);
+        o->outlinks.push_back(out);
+      }
+    }
+  }
+  // Analysis and extract the inputs and outputs of this graph.
+  graph->Build();
+}
+
+Pass *FluidToDataFlowGraphPass::CreatePrinterPass(
+    std::ostream &os, const std::string &banner) const {
+  return nullptr;
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
new file mode 100644
index 0000000000..33517e57be
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/*
+ * This file implements the transformation from data flow graph to fluid
+ * ProgramDesc.
+ */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Transform a FluidDesc to a data flow graph.
+ */
+class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
+ public:
+  FluidToDataFlowGraphPass();
+  bool Initialize() override;
+  bool Initialize(const framework::proto::ProgramDesc &desc) override;
+  bool Finalize() override;
+
+  void Run(DataFlowGraph *graph) override;
+
+  Pass *CreatePrinterPass(std::ostream &os,
+                          const std::string &banner) const override;
+
+ private:
+  framework::proto::ProgramDesc const *desc_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
new file mode 100644
index 0000000000..817d32c92c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -0,0 +1,37 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Init) {
+  FluidToDataFlowGraphPass pass;
+  pass.Initialize();
+  pass.Initialize(desc);
+  DataFlowGraph graph;
+  pass.Run(&graph);
+  ASSERT_GT(graph.nodes.size(), 0);
+  pass.Finalize();
+  LOG(INFO) << '\n' << graph.DotString();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/graph_traits.cc b/paddle/fluid/inference/analysis/graph_traits.cc
new file mode 100644
index 0000000000..2ea70a1d20
--- /dev/null
+++ b/paddle/fluid/inference/analysis/graph_traits.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
diff --git a/paddle/fluid/inference/analysis/graph_traits.h b/paddle/fluid/inference/analysis/graph_traits.h
new file mode 100644
index 0000000000..aed2b1e8e2
--- /dev/null
+++ b/paddle/fluid/inference/analysis/graph_traits.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the GraphTraits<X> template class that should be specified
+ * by classes that want to be iteratable by generic graph iterators.
+ *
+ * This file also defines the marker class Inverse that is used to iterate over
+ * graphs in a graph defined, inverse ordering...
+ */
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * This class should be specialized by different graph types...
+ * That's why the base class is empty.
+ */
+template <typename GraphType>
+struct GraphTraits {
+  // using NodesBFSIterator = xxx
+
+  // NodesBFSIterator nodes_begin();
+  // NodesBFSIterator nodes_end();
+};
+
+/*
+ * Inverse - This class is used as a marker class to tell the graph iterator to
+ * iterate in a graph defined Inverse order.
+ */
+template <typename GraphType>
+struct Inverse {
+  const GraphType &graph;
+
+  explicit Inverse(const GraphType &graph) : graph(graph) {}
+};
+
+/*
+ * Provide a partial specialization of GraphTraits so that the inverse of an
+ * inverse turns into the original graph.
+ */
+template <typename GraphType>
+struct GraphTraits<Inverse<Inverse<GraphType>>> : GraphTraits<GraphType> {};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index b2d06c5d63..58eb0e715c 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -1,74 +1,125 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-template <typename IteratorT>
-class iterator_range {
-  IteratorT begin_, end_;
-
- public:
-  template <typename Container>
-  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
-
-  iterator_range(const IteratorT &begin, const IteratorT &end)
-      : begin_(begin), end_(end) {}
-
-  const IteratorT &begin() const { return begin_; }
-  const IteratorT &end() const { return end_; }
-};
-
-/*
- * An registry helper class, with its records keeps the order they registers.
- */
-template <typename T>
-class OrderedRegistry {
- public:
-  T *Register(const std::string &name, T *x) {
-    PADDLE_ENFORCE(!dic_.count(name));
-    dic_[name] = data_.size();
-    data_.emplace_back(std::unique_ptr<T>(x));
-    return data_.back().get();
-  }
-
-  T *Lookup(const std::string &name) {
-    auto it = dic_.find(name);
-    if (it == dic_.end()) return nullptr;
-    return data_[it->second].get();
-  }
-
- protected:
-  std::unordered_map<std::string, int> dic_;
-  std::vector<std::unique_ptr<T>> data_;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
-
-#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
-                                                \
-  type__(const type__ &) = delete;              \
-                                                \
-  void operator=(const type__ &) = delete;
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+template <typename Vec>
+int AccuDims(Vec &&vec, int size) {
+  int res = 1;
+  for (int i = 0; i < size; i++) {
+    res *= std::forward<Vec>(vec)[i];
+  }
+  return res;
+}
+
+#define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__;
+/*
+ * Map typeid to representation.
+ */
+struct DataTypeNamer {
+  static const DataTypeNamer &Global() {
+    static auto *x = new DataTypeNamer();
+    return *x;
+  }
+
+  template <typename T>
+  const std::string &repr() const {
+    auto x = typeid(T).hash_code();
+    PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
+    return dic_.at(x);
+  }
+
+  const std::string &repr(size_t &hash) const {  // NOLINT
+    PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation");
+    return dic_.at(hash);
+  }
+
+ private:
+  DataTypeNamer() {
+    SET_TYPE(int);
+    SET_TYPE(bool);
+    SET_TYPE(float);
+  }
+
+  std::unordered_map<decltype(typeid(int).hash_code()),  // NOLINT
+                     std::string>
+      dic_;
+};
+#undef SET_TYPE
+
+template <typename IteratorT>
+class iterator_range {
+  IteratorT begin_, end_;
+
+ public:
+  template <typename Container>
+  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
+
+  iterator_range(const IteratorT &begin, const IteratorT &end)
+      : begin_(begin), end_(end) {}
+
+  const IteratorT &begin() const { return begin_; }
+  const IteratorT &end() const { return end_; }
+};
+
+/*
+ * An registry helper class, with its records keeps the order they registers.
+ */
+template <typename T>
+class OrderedRegistry {
+ public:
+  T *Register(const std::string &name, T *x) {
+    PADDLE_ENFORCE(!dic_.count(name));
+    dic_[name] = data_.size();
+    data_.emplace_back(std::unique_ptr<T>(x));
+    return data_.back().get();
+  }
+
+  T *Lookup(const std::string &name) {
+    auto it = dic_.find(name);
+    if (it == dic_.end()) return nullptr;
+    return data_[it->second].get();
+  }
+
+ protected:
+  std::unordered_map<std::string, int> dic_;
+  std::vector<std::unique_ptr<T>> data_;
+};
+
+template <typename T>
+T &GetFromScope(const framework::Scope &scope, const std::string &name) {
+  framework::Variable *var = scope.FindVar(name);
+  PADDLE_ENFORCE(var != nullptr);
+  return *var->GetMutable<T>();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
+  type__(const type__ &) = delete;              \
+  void operator=(const type__ &) = delete;
diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h
index 07cb7669f9..7972ca25c9 100644
--- a/paddle/fluid/inference/analysis/node.h
+++ b/paddle/fluid/inference/analysis/node.h
@@ -117,7 +117,10 @@ class Node {
         type_hash_ = typeid(T).hash_code();
         data_.resize(sizeof(T));
       }
-      PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), "type not matched");
+      PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
+                     "type not matched, origin is %s, want %s",
+                     DataTypeNamer::Global().repr(type_hash_),
+                     DataTypeNamer::Global().repr<T>());
       PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
       return *reinterpret_cast<T *>(&data_[0]);
     }
@@ -127,6 +130,10 @@ class Node {
     size_t type_hash_{std::numeric_limits<size_t>::max()};
   };
 
+  bool IsFunction() const { return type_ == Node::Type::kFunction; }
+  bool IsValue() const { return type_ == Node::Type::kValue; }
+  bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; }
+
   virtual ~Node() {}
 
   friend class NodeMap;
diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc
index 47fea0fdff..ea832a3a7e 100644
--- a/paddle/fluid/inference/analysis/node_tester.cc
+++ b/paddle/fluid/inference/analysis/node_tester.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/node.h"
 
diff --git a/paddle/fluid/inference/analysis/pass.cc b/paddle/fluid/inference/analysis/pass.cc
new file mode 100644
index 0000000000..121b72c0a0
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass.cc
@@ -0,0 +1,15 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/pass.h"
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
new file mode 100644
index 0000000000..aa0e8667b5
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <iosfwd>
+#include <string>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class Pass {
+ public:
+  Pass() = default;
+  virtual ~Pass() {}
+  // Virtual method overridden by subclasses to do only necessary initialization
+  // before any pass is run.
+  virtual bool Initialize() { return false; }
+  // There is some passes such as FlowToDataFlowGraphPass that needs a
+  // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
+  // only couple with the proto file.
+  virtual bool Initialize(const framework::proto::ProgramDesc &desc) {
+    return false;
+  }
+  // There are some Passes such as DataFlowGraphToFluidPass that will output a
+  // ProgramDesc.
+  virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; }
+
+  // Virtual method overriden by subclasses to do any necessary clean up after
+  // all passes have run.
+  virtual bool Finalize() { return false; }
+
+  // Get a Pass appropriate to print the Node this pass operates on.
+  virtual Pass *CreatePrinterPass(std::ostream &os,
+                                  const std::string &banner) const = 0;
+
+  // Run on a single Node.
+  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single Function.
+  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single FunctionBlock.
+  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single DataFlowGraph.
+  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+};
+
+// NodePass process on any Node types.
+class NodePass : public Pass {
+ public:
+  virtual void Run(Node *node) = 0;
+};
+
+// NodePass process on any Function node types.
+class FunctionPass : public Pass {
+ public:
+  virtual void Run(Function *node) = 0;
+};
+
+// NodePass process on any FunctionBlock node types.
+class FunctionBlockPass : public Pass {
+ public:
+  virtual void Run(FunctionBlock *node) = 0;
+};
+
+// GraphPass processes on any GraphType.
+class DataFlowGraphPass : public Pass {
+ public:
+  virtual void Run(DataFlowGraph *graph) = 0;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
new file mode 100644
index 0000000000..43ccac96c8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+const char *SubGraphSplitter::kMarkerAttrName =
+    "_sub_graph_splitter_inside_sub_graph";
+
+std::vector<std::vector<Node *>> SubGraphSplitter::operator()() {
+  MarkNodesInsideSubGraph();
+  return ExtractSubGraphs();
+}
+
+// Mark the output variables inside a subgraph with the func.
+inline void MarkOutLinksInSubGraph(const Function *func) {
+  for (auto *var : func->outlinks) {
+    var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true;
+  }
+}
+
+void SubGraphSplitter::MarkNodesInsideSubGraph() {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node_inside_subgraph_teller_(&node)) {
+      node.attr(kMarkerAttrName).Bool() = true;
+      if (node.type() == Node::Type::kFunction) {
+        // If a function is inside the sub-graph, mark all the output variables
+        // to be inside too, so that two marked functions will be inside a same
+        // sub-graph, lets take a example:  A_function->var->B_function, if
+        // A_function is marked, var should also be marked, so that B_function
+        // will be in the same sub-graph with A_function if B_function is
+        // marked.
+        MarkOutLinksInSubGraph(static_cast<const Function *>(&node));
+      }
+    }
+  }
+}
+
+const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_";
+
+// Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node
+// a's output is node b, that is a and b is in the same sub-graph. The UF
+// algorithm will group them to the same cluster.
+using node_map_t = std::unordered_map<int, Node *>;
+// Find the ancestor id of a node.
+int UnionFindGetAncestor(const node_map_t &node_map, size_t id) {
+  int tmp = id;
+  do {
+    tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32();
+  } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp);
+  return tmp;
+}
+// Make this two node share the same ancestor.
+// TODO(Superjom) bad performance, make a balanced tree latter.
+void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
+  int a_ancestor = UnionFindGetAncestor(node_map, a);
+  int b_ancestor = UnionFindGetAncestor(node_map, b);
+  node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
+}
+
+std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+  std::vector<Node *> marked_nodes;
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node.attr(kMarkerAttrName).Bool()) {
+      marked_nodes.push_back(&node);
+    }
+  }
+  // extract sub-graphs in the marked node set, use Union Find algorithm.
+  node_map_t node_map;  // id to ptr
+  for (auto *n : marked_nodes) {
+    // n's parent == n.id means it is the ancestor
+    n->attr(kUnionFindParent).Int32() = n->id();
+    node_map[n->id()] = n;
+  }
+  std::unordered_set<Node *> visited;
+  for (auto *n : marked_nodes) {
+    for (auto *out : n->outlinks) {
+      if (node_map.count(out->id())) {
+        UnionFindCombine(node_map, n->id(), out->id());
+      }
+    }
+  }
+
+  std::unordered_map<int /*ancestor*/, std::vector<Node *>> clusters;
+  for (auto *n : marked_nodes) {
+    if (n->type() == Node::Type::kFunction) {
+      clusters[UnionFindGetAncestor(node_map,
+                                    n->attr(kUnionFindParent).Int32())]
+          .push_back(n);
+    }
+  }
+  std::vector<std::vector<Node *>> result;
+  std::for_each(clusters.begin(), clusters.end(),
+                [&](const decltype(clusters)::value_type &it) {
+                  result.push_back(it.second);
+                });
+
+  return result;
+}
+
+void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
+
+void SubGraphFuse::ReplaceNodesWithSubGraphs() {
+  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
+  for (auto &subgraph : subgraphs) {
+    // replace this sub-graph with the first node. Two steps: 1. Create a Block
+    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
+    // as deleted. 3. Replace the deleted node with the new Block Node.
+    auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock);
+    auto io = ExtractInputAndOutputOfSubGraph(subgraph);
+    block_node->inlinks = std::move(io.first);
+    block_node->outlinks = std::move(io.second);
+    for (auto *node : subgraph) {
+      // TODO(Superjomn) need a unified mechanism to treat deleted node in each
+      // pass.
+      node->SetDeleted();
+    }
+
+    std::unordered_map<Node *, Node *>
+        delelte_node_map;  // deleted node to BlockNode
+    for (auto *n : block_node->inlinks) {
+      n->inlinks.clear();
+    }
+    for (auto *n : block_node->outlinks) {
+      n->outlinks.clear();
+    }
+    for (auto *n : block_node->inlinks) {
+      n->outlinks.push_back(block_node);
+    }
+    for (auto *n : block_node->outlinks) {
+      n->inlinks.push_back(n);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h
new file mode 100644
index 0000000000..a31afbe693
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the the class to partition a graph.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Detect the nodes in a sub-graph that meet some conditions. This class doesn't
+ * modify the graph.
+ */
+class SubGraphSplitter {
+ public:
+  static const char *kMarkerAttrName;
+  // Tell whether a node is inside a sub-graph.
+  using NodeInsideSubgraphTeller = std::function<bool(const Node *)>;
+
+  SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  std::vector<std::vector<Node *>> operator()();
+
+ protected:
+  // Mark the nodes inside the accepted sub-graph using
+  // node_inside_subgraph_teller.
+  void MarkNodesInsideSubGraph();
+
+  // Merge the marked nodes into sub-graphs and return the sub-graphs.
+  std::vector<std::vector<Node *>> ExtractSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+/*
+ * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To
+ * some extent, the TensorRT engine is just a fusion op for a model.
+ */
+class SubGraphFuse {
+ public:
+  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
+
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  // The main method which run all the logic.
+  void operator()();
+
+ protected:
+  // Remove the nodes inside sub-graphs and replace with the SubGraphNode.
+  void ReplaceNodesWithSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
new file mode 100644
index 0000000000..0644c0db12
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Split) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  LOG(INFO) << "spliter\n" << dfg.DotString();
+
+  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+    if (node->type() != Node::Type::kFunction) return false;
+    const auto* func = static_cast<const Function*>(node);
+    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+        func->func_type() == "conv2d" || func->func_type() == "mul" ||
+        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+      LOG(INFO) << "sub-graph marked " << node->repr();
+      return true;
+    }
+    return false;
+  };
+  ASSERT_GT(dfg.nodes.size(), 5UL);
+
+  auto subgraphs = SubGraphSplitter(&dfg, teller)();
+
+  // Check the number of the marked nodes.
+  int marked_nodes = 0;
+  for (auto& node : dfg.nodes.nodes()) {
+    if (node->IsFunction() &&
+        node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) {
+      ++marked_nodes;
+    }
+  }
+  EXPECT_EQ(marked_nodes, 6);
+
+  // For human debug.
+  for (auto& subgraph : subgraphs) {
+    LOG(INFO) << "subgraph size " << subgraph.size();
+    for (auto* node : subgraph) {
+      LOG(INFO) << "node " << node->repr();
+    }
+  }
+
+  ASSERT_EQ(subgraphs.size(), 1UL);
+  // The last sub-graph has 5 Functions.
+  ASSERT_EQ(subgraphs.back().size(), 6UL);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
new file mode 100644
index 0000000000..722fa99a48
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <string>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/io.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(inference_model_dir, "", "inference test model dir");
+
+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string& model_dir = FLAGS_inference_model_dir) {
+  paddle::platform::CPUPlace place;
+  paddle::framework::Executor executor(place);
+  paddle::framework::Scope scope;
+  auto program = Load(&executor, &scope, model_dir);
+  return *program->Proto();
+}
+
+static DataFlowGraph ProgramDescToDFG(
+    const framework::proto::ProgramDesc& desc) {
+  DataFlowGraph graph;
+  FluidToDataFlowGraphPass pass;
+  pass.Initialize(desc);
+  pass.Run(&graph);
+  pass.Finalize();
+  return graph;
+}
+
+class DFG_Tester : public ::testing::Test {
+ protected:
+  void SetUp() override { desc = LoadProgramDesc(FLAGS_inference_model_dir); }
+
+  framework::proto::ProgramDesc desc;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 4fb4511d99..748f5a084e 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,4 +1,15 @@
-nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
-nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
-  DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
+# Add TRT tests
+nv_library(tensorrt_converter
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc
+  DEPS tensorrt_engine mul_op)
+
+nv_test(test_op_converter SRCS test_op_converter.cc DEPS
+  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
+nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index 6297051e5a..e1cace9cc1 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
@@ -21,10 +22,11 @@ namespace tensorrt {
 class ReluOpConverter : public OpConverter {
  public:
   ReluOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
-    framework::OpDesc op_desc(op, nullptr, nullptr);
+    framework::OpDesc op_desc(op, nullptr);
     LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
                  "type is Relu";
     const nvinfer1::ITensor* input_tensor =
@@ -32,12 +34,17 @@ class ReluOpConverter : public OpConverter {
     nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         nvinfer1::ActivationType::kRELU);
-    engine_->SetITensor(op_desc.Output("Out")[0], layer->getOutput(0));
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 209936c3ba..8e7e23377d 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -21,14 +21,15 @@ namespace tensorrt {
 class Conv2dOpConverter : public OpConverter {
  public:
   Conv2dOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
new file mode 100644
index 0000000000..bb603efaf3
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
+// tensorflow.
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+              T* odata, nvinfer1::DimsHW ostrides) {
+  for (int h = 0; h < shape.h(); ++h) {
+    for (int w = 0; w < shape.w(); ++w) {
+      odata[h * ostrides.h() + w * ostrides.w()] =
+          idata[h * ostrides.h() + w * ostrides.w()];
+    }
+  }
+}
+
+// Reorder the data layout from CK to KC.
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+                   TensorRTEngine::Weight* oweights) {
+  int c = iweights.dims[0];
+  int k = iweights.dims[1];
+  oweights->dims.assign({k, c});
+  nvinfer1::DimsHW istrides = {1, k};
+  nvinfer1::DimsHW ostrides = {c, 1};
+  Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
+           static_cast<float*>(const_cast<void*>(oweights->get().values)),
+           ostrides);
+}
+
+/*
+ * FC converter convert a MUL op in Fluid to a FC layer in TRT.
+ */
+class FcOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    // Declare inputs
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    // This may trigger a GPU->CPU copy, because TRT's weight can only be
+    // assigned from CPU memory, that can't be avoided.
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
+    size_t n_output = Y_t->dims()[1];
+
+    framework::LoDTensor tmp;
+    tmp.Resize(Y_t->dims());
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
+           Y_t->dims()[0] * Y_t->dims()[1]);
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(tmp.data<float>()),
+                                      Y_t->memory_size() / sizeof(float));
+    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
+    tmp_weight.dims = weight.dims;
+
+    // The data layout of TRT FC layer's weight is different from fluid's FC,
+    // need to reorder the elements.
+    ReorderCKtoKC(tmp_weight, &weight);
+
+    // Currently, the framework can only handle one fluid op -> one TRT layer,
+    // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
+    // handle `mul`, leave `add` as another layer.
+    // DEBUG
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                       *const_cast<nvinfer1::ITensor*>(X),
+                                       n_output, weight.get(), bias.get());
+
+    auto output_name = op_desc.Output("Out").front();
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 3ca58b139b..3c34295736 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -18,16 +18,36 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+/*
+ * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
+ */
 class MulOpConverter : public OpConverter {
  public:
-  MulOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
-    LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias";
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
+    // Both the input1 and input2 do not need transpose.
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, MatrixMultiply, *const_cast<nvinfer1::ITensor*>(input1), false,
+        *const_cast<nvinfer1::ITensor*>(input2), false);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_OP(mul);
+REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 1cd3ed9a00..c7a5a49dd0 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -31,27 +32,45 @@ namespace tensorrt {
 class OpConverter {
  public:
   OpConverter() {}
-  virtual void operator()(const framework::proto::OpDesc& op) {}
 
-  void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
-    std::string type = op.type();
-    auto* it = Registry<OpConverter>::Lookup(type);
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
+  // Converter logic for an op.
+  virtual void operator()(const framework::proto::OpDesc& op,
+                          const framework::Scope& scope,
+                          bool test_mode = false) {}
+
+  // Convert a single fluid operator and add the corresponding layer to TRT.
+  // test_mode: whether the instance executes in an unit test.
+  void ConvertOp(const framework::proto::OpDesc& op,
+                 const std::unordered_set<std::string>& parameters,
+                 const framework::Scope& scope, TensorRTEngine* engine,
+                 bool test_mode = false) {
+    framework::OpDesc op_desc(op, nullptr);
+
+    OpConverter* it{nullptr};
+
+    if (op_desc.Type() == "mul") {
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        it = Registry<OpConverter>::Lookup("fc");
+      }
+    }
+    if (!it) {
+      it = Registry<OpConverter>::Lookup(op_desc.Type());
+    }
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                            op_desc.Type());
     it->SetEngine(engine);
-    (*it)(op);
-  }
-
-  // convert fluid op to tensorrt layer
-  void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
-    OpConverter::Run(op, engine);
+    (*it)(op, scope, test_mode);
   }
 
   // convert fluid block to tensorrt network
   void ConvertBlock(const framework::proto::BlockDesc& block,
-                    TensorRTEngine* engine) {
+                    const std::unordered_set<std::string>& parameters,
+                    const framework::Scope& scope, TensorRTEngine* engine) {
     for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
-      OpConverter::Run(op, engine);
+      ConvertOp(op, parameters, scope, engine);
     }
   }
 
@@ -62,6 +81,9 @@ class OpConverter {
   // TensorRT engine
   TensorRTEngine* engine_{nullptr};
 
+ protected:
+  bool test_mode_;
+
  private:
   // registered op converter map, whose key is the fluid op type, and value is
   // the pointer position of corresponding OpConverter class.
@@ -70,13 +92,24 @@ class OpConverter {
   framework::Scope* scope_{nullptr};
 };
 
-#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)       \
-  struct trt_##op_type__##_converter {                          \
-    trt_##op_type__##_converter() {                             \
-      Registry<OpConverter>::Register<Converter__>(#op_type__); \
-    }                                                           \
-  };                                                            \
-  trt_##op_type__##_converter trt_##op_type__##_converter__;
+#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
+  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
+    trt_##op_type__##_converter() {                                            \
+      ::paddle::inference::                                                    \
+          Registry<paddle::inference::tensorrt::OpConverter>::Register<        \
+              ::paddle::inference::tensorrt::Converter__>(#op_type__);         \
+    }                                                                          \
+  };                                                                           \
+  trt_##op_type__##_converter trt_##op_type__##_converter__;                   \
+  int TouchConverterRegister_##op_type__() {                                   \
+    trt_##op_type__##_converter__.Touch();                                     \
+    return 0;                                                                  \
+  }
+
+#define USE_TRT_CONVERTER(op_type__)                                    \
+  extern int TouchConverterRegister_##op_type__();                      \
+  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
+      TouchConverterRegister_##op_type__();
 
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index ec33f97c82..0a02a7bebf 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -1,104 +1,47 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_OP(relu);
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void Compare(const std::string op_type, float input, float expect) {
+TEST(ReluOpConverter, main) {
   framework::Scope scope;
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-
-  // init fluid op and variable
-  auto x_var = scope.Var("X");
-  auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
-  x_tensor->Resize({1, 1});
-  x_tensor->mutable_data<float>(place);
-  std::vector<float> init;
-  init.push_back(input);
-  framework::TensorFromVector(init, ctx, x_tensor);
-
-  auto out_var = scope.Var("Out");
-  auto out_tensor = out_var->GetMutable<framework::LoDTensor>();
-  out_tensor->Resize({1, 1});
-  out_tensor->mutable_data<float>(place);
-
-  framework::OpDesc op_desc;
-  op_desc.SetType(op_type);
-  op_desc.SetInput("X", {"X"});
-  op_desc.SetOutput("Out", {"Out"});
-
-  auto op = framework::OpRegistry::CreateOp(*op_desc.Proto());
-
-  // run fluid op
-  op->Run(scope, place);
-  // get fluid output
-  std::vector<float> out1;
-  framework::TensorToVector(*out_tensor, ctx, &out1);
-
-  // init tensorrt op
-  cudaStream_t stream;
-  ASSERT_EQ(0, cudaStreamCreate(&stream));
-  TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream);
-  engine->InitNetwork();
-  engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
-                       nvinfer1::DimsCHW{1, 1, 1});
-  // convert op
-  OpConverter op_converter;
-  op_converter.ConvertOp(*op_desc.Proto(), engine);
-
-  engine->DeclareOutput("Out");
-  engine->FreezeNetwork();
-
-  // convert LoDTensor to ITensor
-  size_t size = x_tensor->memory_size();
-  EngineIOConverter::ConvertInput(op_type, *x_tensor,
-                                  engine->buffer("X").buffer, size, &stream);
-  // run tensorrt Outp
-  engine->Execute(1);
-  // convert ITensor to LoDTensor
-  EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer,
-                                   out_tensor, size, &stream);
-  // get tensorrt output
-  std::vector<float> out2;
-  framework::TensorToVector(*out_tensor, ctx, &out2);
-
-  // compare
-  ASSERT_EQ(out1[0], out2[0]);
-  ASSERT_EQ(out1[0], expect);
-
-  delete engine;
-  cudaStreamDestroy(stream);
-}
-
-TEST(OpConverter, ConvertRelu) {
-  Compare("relu", 1, 1);   // relu(1) = 1
-  Compare("relu", -5, 0);  // relu(-5) = 0
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
+  validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("relu");
+  desc.SetInput("X", {"relu-X"});
+  desc.SetOutput("Out", {"relu-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
 }
 
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_OP(relu);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
new file mode 100644
index 0000000000..a30253072a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(fc_op, test) {
+  std::unordered_set<std::string> parameters({"mul-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(20, parameters, scope, 1000);
+
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
new file mode 100644
index 0000000000..1ce1130e5d
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(MulOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
+  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 8d66543eb7..9b79f86b0e 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
 namespace inference {
@@ -23,15 +24,17 @@ namespace tensorrt {
 TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
   auto* block = prog.MutableBlock(0);
-  auto* mul_op = block->AppendOp();
-  mul_op->SetType("mul");
   auto* conv2d_op = block->AppendOp();
   conv2d_op->SetType("conv2d");
 
   OpConverter converter;
-  converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/);
+  framework::Scope scope;
+  converter.ConvertBlock(*block->Proto(), {}, scope,
+                         nullptr /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_TRT_CONVERTER(conv2d)
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
new file mode 100644
index 0000000000..3b1f531adc
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -0,0 +1,174 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file implements a UT framework to make the validation of transforming
+ * Fluid Op to TRT Layer.
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Get a random float value between [low, high]
+ */
+float random(float low, float high) {
+  static std::random_device rd;
+  static std::mt19937 mt(rd());
+  std::uniform_real_distribution<double> dist(1.0, 10.0);
+  return dist(mt);
+}
+
+void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
+                     const platform::DeviceContext& ctx) {
+  auto dims = tensor->dims();
+  size_t num_elements = analysis::AccuDims(dims, dims.size());
+  PADDLE_ENFORCE_GT(num_elements, 0);
+  auto* data = tensor->mutable_data<float>(place);
+  for (size_t i = 0; i < num_elements; i++) {
+    *(data + i) = random(0., 1.);
+  }
+}
+
+/*
+ * Help to validate the correctness between Fluid Op and the corresponding TRT
+ * layer.
+ */
+class TRTConvertValidation {
+ public:
+  TRTConvertValidation() = delete;
+
+  TRTConvertValidation(int batch_size,
+                       const std::unordered_set<std::string>& parameters,
+                       framework::Scope& scope,  // NOLINT
+                       int workspace_size = 1 << 10)
+      : parameters_(parameters), scope_(scope) {
+    // create engine.
+    engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
+    engine_->InitNetwork();
+
+    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+  }
+
+  // Declare a Variable as input with random initialization.
+  void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+    // Declare TRT inputs.
+    engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
+  }
+
+  // Declare a parameter varaible in the scope.
+  void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
+  void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
+  // Declare a variable in a fluid Scope.
+  void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
+    platform::CPUPlace place;
+    platform::CPUDeviceContext ctx(place);
+
+    // Init Fluid tensor.
+    std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
+    auto* x = scope_.Var(name);
+    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+    x_tensor->Resize(framework::make_ddim(dim_vec));
+    RandomizeTensor(x_tensor, place, ctx);
+  }
+
+  void SetOp(const framework::proto::OpDesc& desc) {
+    op_ = framework::OpRegistry::CreateOp(desc);
+
+    Singleton<OpConverter>::Global().ConvertOp(
+        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
+
+    engine_->FreezeNetwork();
+
+    // Declare outputs.
+    op_desc_.reset(new framework::OpDesc(desc, nullptr));
+
+    // Set Inputs.
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      auto* var = scope_.FindVar(input);
+      PADDLE_ENFORCE(var);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+
+      engine_->SetInputFromCPU(
+          input, static_cast<void*>(tensor->data<void>()),
+          sizeof(float) *
+              analysis::AccuDims(tensor->dims(), tensor->dims().size()));
+    }
+  }
+
+  void Execute(int batch_size) {
+    // Execute Fluid Op
+    platform::CPUPlace place;
+    platform::CPUDeviceContext ctx(place);
+    op_->Run(scope_, place);
+    // Execute TRT.
+    engine_->Execute(batch_size);
+    cudaStreamSynchronize(*engine_->stream());
+
+    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    const size_t output_space_size = 200;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      std::vector<float> fluid_out;
+      std::vector<float> trt_out(output_space_size);
+      engine_->GetOutputInCPU(output, &trt_out[0],
+                              output_space_size * sizeof(float));
+      cudaStreamSynchronize(*engine_->stream());
+
+      auto* var = scope_.FindVar(output);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &fluid_out);
+      // Compare two output
+      ASSERT_FALSE(fluid_out.empty());
+      for (size_t i = 0; i < fluid_out.size(); i++) {
+        // Loose the threshold for CI in different machine model.
+        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
+      }
+    }
+  }
+
+  framework::Scope& scope() { return scope_; }
+
+ private:
+  std::unique_ptr<TensorRTEngine> engine_;
+  cudaStream_t stream_;
+  std::unique_ptr<framework::OperatorBase> op_;
+  std::unique_ptr<framework::OpDesc> op_desc_;
+  const std::unordered_set<std::string>& parameters_;
+  framework::Scope& scope_;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 1c296e33a6..596e0fe9da 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cuda.h>
 #include <glog/logging.h>
 #include <string>
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -42,9 +43,10 @@ void TensorRTEngine::Execute(int batch_size) {
 }
 
 TensorRTEngine::~TensorRTEngine() {
+  cudaStreamSynchronize(*stream_);
   // clean buffer
   for (auto& buf : buffers_) {
-    if (buf.buffer != nullptr) {
+    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
       PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
       buf.buffer = nullptr;
       buf.max_size = 0;
@@ -71,13 +73,16 @@ void TensorRTEngine::FreezeNetwork() {
   for (auto& item : buffer_sizes_) {
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
+      auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    AccumDims(infer_engine_->getBindingDimensions(slot_offset));
+                    analysis::AccuDims(dims.d, dims.nbDims);
     }
     auto& buf = buffer(item.first);
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
     PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
+            << buf.buffer;
     buf.size = buf.max_size = item.second;
     buf.device = DeviceType::GPU;
   }
@@ -85,14 +90,16 @@ void TensorRTEngine::FreezeNetwork() {
 
 nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
                                                 nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dim) {
+                                                const nvinfer1::Dims& dims) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                     name);
 
   PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dim);
+  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
-  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim);
+  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
+                        analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
 }
@@ -103,9 +110,12 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                     name);
 
   auto* output = layer->getOutput(offset);
+  SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
+  PADDLE_ENFORCE(output->isNetworkOutput());
   // output buffers' size can only be decided latter, set zero here to mark this
   // and will reset latter.
   buffer_sizes_[name] = 0;
@@ -118,6 +128,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
   auto* output = TensorRTEngine::GetITensor(name);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
   // output buffers' size can only be decided latter, set zero here to mark this
   // and will reset latter.
@@ -128,6 +139,20 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
   return buffer(name).buffer;
 }
 
+void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
+                                    size_t max_size) {
+  // determine data size
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE_GT(it->second, 0);
+  PADDLE_ENFORCE_GE(max_size, it->second);
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+                                    cudaMemcpyDeviceToDevice, *stream_),
+                    0);
+}
+
 void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
                                     size_t max_size) {
   // determine data size
@@ -149,7 +174,7 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
   return buffers_[slot_offset];
 }
 
-void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
+void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
                                      size_t size) {
   auto& buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
@@ -159,16 +184,26 @@ void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
                                        cudaMemcpyHostToDevice, *stream_));
 }
 
+void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
+                                     size_t size) {
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyDeviceToDevice, *stream_));
+}
+
 void TensorRTEngine::SetITensor(const std::string& name,
                                 nvinfer1::ITensor* tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
-  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate itensor name %s",
+  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                     name);
   itensor_map_[name] = tensor;
 }
 
 nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
-  PADDLE_ENFORCE(itensor_map_.count(name), "no itensor %s", name);
+  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
   return itensor_map_[name];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index b8298c6059..b60f00de9f 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace inference {
@@ -37,13 +38,15 @@ class TensorRTEngine : public EngineBase {
   // Weight is model parameter.
   class Weight {
    public:
-    Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
       w_.type = dtype;
       w_.values = value;
       w_.count = num_elem;
     }
     const nvinfer1::Weights& get() { return w_; }
 
+    std::vector<int64_t> dims;
+
    private:
     nvinfer1::Weights w_;
   };
@@ -92,13 +95,15 @@ class TensorRTEngine : public EngineBase {
   cudaStream_t* stream() { return stream_; }
 
   // Fill an input from CPU memory with name and size.
-  void SetInputFromCPU(const std::string& name, void* data, size_t size);
+  void SetInputFromCPU(const std::string& name, const void* data, size_t size);
   // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
   // accessed directly. Fill an input from GPU memory with name and size.
-  void SetInputFromGPU(const std::string& name, void* data, size_t size);
+  void SetInputFromGPU(const std::string& name, const void* data, size_t size);
   // Get an output called name, the output of tensorrt is in GPU, so this method
-  // will just return the output's GPU memory address.
+  // Return the output's GPU memory address without copy.
   void* GetOutputInGPU(const std::string& name);
+  // Copy data into dst inside the GPU device.
+  void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
   // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
   // to CPU.
   void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
@@ -127,7 +132,11 @@ class TensorRTEngine : public EngineBase {
   // TensorRT related internal members
   template <typename T>
   struct Destroyer {
-    void operator()(T* x) { x->destroy(); }
+    void operator()(T* x) {
+      if (x) {
+        x->destroy();
+      }
+    }
   };
   template <typename T>
   using infer_ptr = std::unique_ptr<T, Destroyer<T>>;
@@ -151,6 +160,27 @@ class TensorRTEngine : public EngineBase {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
   engine__->network()->add##layer__(ARGS);
 
+/*
+ * Helper to control the TensorRT engine's creation and deletion.
+ */
+class TRT_EngineManager {
+ public:
+  TensorRTEngine* Create(int max_batch, int max_workspace,
+                         cudaStream_t* stream) {
+    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
+    return engines_.back().get();
+  }
+
+  void DeleteALl() {
+    for (auto& ptr : engines_) {
+      ptr.reset(nullptr);
+    }
+  }
+
+ private:
+  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index 2b402cce60..b6e7968108 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -26,15 +26,6 @@ namespace tensorrt {
 
 namespace dy = paddle::platform::dynload;
 
-static size_t AccumDims(nvinfer1::Dims dims) {
-  size_t num = dims.nbDims == 0 ? 0 : 1;
-  for (int i = 0; i < dims.nbDims; i++) {
-    PADDLE_ENFORCE_GT(dims.d[i], 0);
-    num *= dims.d[i];
-  }
-  return num;
-}
-
 // TensorRT data type to size
 const int kDataTypeSize[] = {
     4,  // kFLOAT
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index dbb81462b8..2fa5a9540b 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -38,3 +38,11 @@ inference_test(recommender_system)
 #inference_test(rnn_encoder_decoder)
 #inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)
+
+# This is an unly work around to make this test run
+# TODO(TJ): clean me up
+cc_test(test_inference_nlp
+  SRCS test_inference_nlp.cc
+  DEPS paddle_fluid
+  ARGS
+  --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
new file mode 100644
index 0000000000..9dcd79c3bb
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -0,0 +1,227 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <time.h>
+#include <fstream>
+#include <thread>  // NOLINT
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+#ifdef PADDLE_WITH_MKLML
+#include <mkl_service.h>
+#include <omp.h>
+#endif
+
+DEFINE_string(model_path, "", "Directory of the inference model.");
+DEFINE_string(data_file, "", "File of input index data.");
+DEFINE_int32(repeat, 100, "Running the inference program repeat times");
+DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
+DEFINE_int32(num_threads, 1, "Number of threads should be used");
+
+inline double GetCurrentMs() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
+}
+
+// This function just give dummy data for recognize_digits model.
+size_t DummyData(std::vector<paddle::framework::LoDTensor>* out) {
+  paddle::framework::LoDTensor input;
+  SetupTensor<float>(&input, {1, 1, 28, 28}, -1.f, 1.f);
+  out->emplace_back(input);
+  return 1;
+}
+
+// Load the input word index data from file and save into LodTensor.
+// Return the size of words.
+size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
+                const std::string& filename) {
+  if (filename.empty()) {
+    return DummyData(out);
+  }
+
+  size_t sz = 0;
+  std::fstream fin(filename);
+  std::string line;
+  out->clear();
+  while (getline(fin, line)) {
+    std::istringstream iss(line);
+    std::vector<int64_t> ids;
+    std::string field;
+    while (getline(iss, field, ' ')) {
+      ids.push_back(stoi(field));
+    }
+    if (ids.size() >= 1024) {
+      // Synced with NLP guys, they will ignore input larger then 1024
+      continue;
+    }
+
+    paddle::framework::LoDTensor words;
+    paddle::framework::LoD lod{{0, ids.size()}};
+    words.set_lod(lod);
+    int64_t* pdata = words.mutable_data<int64_t>(
+        {static_cast<int64_t>(ids.size()), 1}, paddle::platform::CPUPlace());
+    memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t));
+    out->emplace_back(words);
+    sz += ids.size();
+  }
+  return sz;
+}
+
+// Split input data samples into small pieces jobs as balanced as possible,
+// according to the number of threads.
+void SplitData(
+    const std::vector<paddle::framework::LoDTensor>& datasets,
+    std::vector<std::vector<const paddle::framework::LoDTensor*>>* jobs,
+    const int num_threads) {
+  size_t s = 0;
+  jobs->resize(num_threads);
+  while (s < datasets.size()) {
+    for (auto it = jobs->begin(); it != jobs->end(); it++) {
+      it->emplace_back(&datasets[s]);
+      s++;
+      if (s >= datasets.size()) {
+        break;
+      }
+    }
+  }
+}
+
+void ThreadRunInfer(
+    const int tid, paddle::framework::Scope* scope,
+    const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
+  // maybe framework:ProgramDesc is not thread-safe
+  auto& sub_scope = scope->NewScope();
+  auto place = paddle::platform::CPUPlace();
+  auto executor = paddle::framework::Executor(place);
+  auto inference_program =
+      paddle::inference::Load(&executor, scope, FLAGS_model_path);
+
+  auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0);
+  executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0);
+
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  paddle::framework::LoDTensor outtensor;
+  fetch_targets[fetch_target_names[0]] = &outtensor;
+
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+
+  auto& inputs = jobs[tid];
+  auto start_ms = GetCurrentMs();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    feed_targets[feed_target_names[0]] = inputs[i];
+    executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets,
+                                &fetch_targets, false /*create_local_scope*/);
+  }
+  auto stop_ms = GetCurrentMs();
+  scope->DeleteScope(&sub_scope);
+  LOG(INFO) << "Tid: " << tid << ", process " << inputs.size()
+            << " samples, avg time per sample: "
+            << (stop_ms - start_ms) / inputs.size() << " ms";
+}
+
+TEST(inference, nlp) {
+  if (FLAGS_model_path.empty()) {
+    LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model";
+  }
+  if (FLAGS_data_file.empty()) {
+    LOG(WARNING) << "No data file provided, will use dummy data!"
+                 << "Note: if you use nlp model, please provide data file.";
+  }
+  LOG(INFO) << "Model Path: " << FLAGS_model_path;
+  LOG(INFO) << "Data File: " << FLAGS_data_file;
+
+  std::vector<paddle::framework::LoDTensor> datasets;
+  size_t num_total_words = LoadData(&datasets, FLAGS_data_file);
+  LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
+  LOG(INFO) << "Total number of words: " << num_total_words;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  std::unique_ptr<paddle::framework::Scope> scope(
+      new paddle::framework::Scope());
+
+#ifdef PADDLE_WITH_MKLML
+  // only use 1 thread number per std::thread
+  omp_set_dynamic(0);
+  omp_set_num_threads(1);
+  mkl_set_num_threads(1);
+#endif
+
+  double start_ms = 0, stop_ms = 0;
+  if (FLAGS_num_threads > 1) {
+    std::vector<std::vector<const paddle::framework::LoDTensor*>> jobs;
+    SplitData(datasets, &jobs, FLAGS_num_threads);
+    std::vector<std::unique_ptr<std::thread>> threads;
+    start_ms = GetCurrentMs();
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads.emplace_back(
+          new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs)));
+    }
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads[i]->join();
+    }
+    stop_ms = GetCurrentMs();
+  } else {
+    // 1. Define place, executor, scope
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+
+    // 2. Initialize the inference_program and load parameters
+    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+    inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
+                                    /*model combined*/ false);
+    // always prepare context
+    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    ctx = executor.Prepare(*inference_program, 0);
+    if (FLAGS_prepare_vars) {
+      executor.CreateVariables(*inference_program, scope.get(), 0);
+    }
+    // preapre fetch
+    const std::vector<std::string>& fetch_target_names =
+        inference_program->GetFetchTargetNames();
+    PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    paddle::framework::LoDTensor outtensor;
+    fetch_targets[fetch_target_names[0]] = &outtensor;
+
+    // prepare feed
+    const std::vector<std::string>& feed_target_names =
+        inference_program->GetFeedTargetNames();
+    PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+
+    // feed data and run
+    start_ms = GetCurrentMs();
+    for (size_t i = 0; i < datasets.size(); ++i) {
+      feed_targets[feed_target_names[0]] = &(datasets[i]);
+      executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets,
+                                  &fetch_targets, !FLAGS_prepare_vars);
+    }
+    stop_ms = GetCurrentMs();
+    LOG(INFO) << "Tid: 0, process " << datasets.size()
+              << " samples, avg time per sample: "
+              << (stop_ms - start_ms) / datasets.size() << " ms";
+  }
+  LOG(INFO) << "Total inference time with " << FLAGS_num_threads
+            << " threads : " << (stop_ms - start_ms) / 1000.0
+            << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000);
+}
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index cc1589514a..44c36b1683 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DECLARE_bool(use_mkldnn);
+
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor* input,
                  paddle::framework::DDim dims, T lower, T upper) {
@@ -194,7 +196,10 @@ void TestInference(const std::string& dirname,
     fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
   }
 
-  // 6. Run the inference program
+  // 6. If export Flags_use_mkldnn=True, use mkldnn related ops.
+  if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program);
+
+  // 7. Run the inference program
   {
     if (!CreateVars) {
       // If users don't want to create and destroy variables every time they
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 7fce138e3f..d6a36eff09 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -166,8 +166,8 @@ function(op_library TARGET)
       # NOTE(*): activation use macro to regist the kernels, set use_op manually.
       if(${TARGET} STREQUAL "activation")
         file(APPEND ${pybind_file} "USE_OP(relu);\n")
-      elseif(${TARGET} STREQUAL "reduce")
-        file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
+      elseif(${TARGET} STREQUAL "fake_dequantize")
+        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
       else()
         file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
       endif()
@@ -186,39 +186,57 @@ endif()
 
 add_subdirectory(detail)
 if(WITH_DISTRIBUTE)
-    
-    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+
+    set(DISTRIBUTE_DEPS "")
+    if(WITH_GRPC)
+        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+    else()
+        set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib)
+    endif()
+
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
+    op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
+    set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
+    #        listen_and_serv_op sum_op executor SERIAL)
     if(WITH_GPU)
         set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor)
-        op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
+        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL)
+        if(WITH_GRPC)
+            op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
+        else()
+            op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc)
+        endif()
         set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     else()
         set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
     endif()
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op gen_nccl_id_op)
+    set(DEPS_OPS ${DEPS_OPS}  prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
 endif()
 
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
+if (WITH_GPU AND TENSORRT_FOUND)
+    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
+      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter)
+else()
+    set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
+endif()
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
 op_library(print_op DEPS lod_tensor)
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
index ab7c612271..46ed99bcf2 100644
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -15,6 +15,7 @@
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/mkldnn_activation_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -23,6 +24,18 @@ using paddle::framework::Tensor;
 using paddle::platform::MKLDNNDeviceContext;
 
 namespace {
+std::string gethash(const mkldnn::memory::dims &operand_dims,
+                    const mkldnn::algorithm algorithm) {
+  auto dim2str = [](const mkldnn::memory::dims &operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  };
+  return dim2str(operand_dims) + std::to_string(algorithm);
+}
+
 template <typename T, typename ExecContext>
 void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
                      const T alpha = 0, const T beta = 0) {
@@ -37,42 +50,70 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const auto *src_data = src->template data<T>();
 
   auto *dst = ctx.template Output<Tensor>("Out");
-  const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+  T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
 
   // get memory dim
   PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
                  "Input dim must be with 2 or 4");
   std::vector<int> src_tz = framework::vectorize2int(src->dims());
 
-  // create memory description
-  auto data_md = src_tz.size() == 2
-                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nc)
-                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-  // create memory primitives
-  auto src_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(src_data)));
-  auto dst_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(dst_data)));
-
-  auto forward_desc = mkldnn::eltwise_forward::desc(
-      mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
-
-  // save prim desc into global device context to be referred in backward path
-  const std::string key = ctx.op().Output("Out");
-  const std::string key_eltwise_pd = key + "@eltwise_pd";
-  auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
-      forward_desc, mkldnn_engine);
-  dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
-
-  auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
+  const std::string key = gethash(src_tz, algorithm);
+  const std::string key_src_data =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key + "@eltwise_fwd";
+
+  auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
+      dev_ctx.GetBlob(key_fwd));
+
+  // save input data to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(src_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+
+  if (p_fwd == nullptr) {
+    // create memory description
+    auto data_md = src_tz.size() == 2
+                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nc)
+                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nchw);
+
+    // create memory primitives
+    auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
+        {data_md, mkldnn_engine}, platform::to_void_cast(src_data)));
+    dev_ctx.SetBlob(key_src_mem, p_src_mem);
+
+    auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
+        {data_md, mkldnn_engine}, platform::to_void_cast(dst_data)));
+    dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
+
+    auto fwd_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
+    auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+        fwd_desc, mkldnn_engine);
+    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
+    dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(
+        *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
+    dev_ctx.SetBlob(key_fwd, p_fwd);
+  } else {
+    // primitives already exist
+    auto p_src_mem =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+    PADDLE_ENFORCE(p_src_mem != nullptr,
+                   "Fail to find eltwise p_src_mem in device context.");
+    auto p_dst_mem =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
+    PADDLE_ENFORCE(p_dst_mem != nullptr,
+                   "Fail to find eltwise p_src_mem in device context.");
+
+    p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data));
+    p_dst_mem->set_data_handle(dst_data);
+  }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {eltwise};
+  std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())};
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 
@@ -83,8 +124,7 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const auto &mkldnn_engine = dev_ctx.GetEngine();
 
   // get buffers
-  const auto *x = ctx.template Input<Tensor>("X");
-  const auto *src = x->template data<T>();
+  const auto *out = ctx.template Input<Tensor>("Out");
 
   auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
   const auto *diff_dst = dout->template data<T>();
@@ -94,45 +134,73 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
 
   // get memory dim
-  std::vector<int> src_tz = framework::vectorize2int(x->dims());
-
-  // create memory description
-  auto data_md = src_tz.size() == 2
-                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nc)
-                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-  // create memory primitives
-  auto src_memory = mkldnn::memory(
-      {data_md, mkldnn_engine}, static_cast<void *>(const_cast<float *>(src)));
-  auto diff_src_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(diff_src)));
-  auto diff_dst_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(diff_dst)));
-
-  auto backward_desc =
-      mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
-
-  // retrieve eltwise primitive desc from device context
-  const std::string key = ctx.op().Input("Out");
-  const std::string key_eltwise_pd = key + "@eltwise_pd";
-  const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
-  PADDLE_ENFORCE(forward_pd != nullptr,
-                 "Fail to find eltwise_pd in device context");
-  auto *p_forward_pd =
-      static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
-
-  auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
-      backward_desc, mkldnn_engine, *p_forward_pd);
-
-  auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
-                                              diff_dst_memory, diff_src_memory);
+  std::vector<int> src_tz = framework::vectorize2int(out->dims());
+
+  const std::string key = gethash(src_tz, algorithm);
+  const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key + "@eltwise_grad";
+
+  const std::string key_src_data =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const auto p_src_data =
+      std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));
+
+  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
+  auto p_src_mem =
+      std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+  p_src_mem->set_data_handle(*p_src_data.get());
+
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_forward::primitive>(
+      dev_ctx.GetBlob(key_grad));
+
+  if (p_grad == nullptr) {
+    // create memory description
+    auto data_md = src_tz.size() == 2
+                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nc)
+                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nchw);
+
+    // create memory primitives
+    std::shared_ptr<void> p_diff_src_mem =
+        std::make_shared<mkldnn::memory>(mkldnn::memory(
+            {data_md, mkldnn_engine}, platform::to_void_cast(diff_src)));
+    dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem);
+    std::shared_ptr<void> p_diff_dst_mem =
+        std::make_shared<mkldnn::memory>(mkldnn::memory(
+            {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst)));
+    dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem);
+
+    auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md,
+                                                   alpha, beta);
+
+    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
+    auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>(
+        dev_ctx.GetBlob(key_fwd_pd).get());
+
+    auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
+        bwd_desc, mkldnn_engine, *p_fwd_pd);
+
+    p_grad = std::make_shared<mkldnn::eltwise_backward>(
+        eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()),
+        *(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())),
+        *(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
+  } else {
+    // primitives already exist
+    auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_src_mem));
+    auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_dst_mem));
+
+    p_diff_src_mem->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_src));
+    p_diff_dst_mem->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_dst));
+  }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
+  std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())};
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 }  // anonymous namespace
@@ -154,35 +222,35 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
 };
 
 template <typename T>
-using ReluMkldnnFunctor =
+using ReluMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
 
 template <typename T>
-using TanhMkldnnFunctor =
+using TanhMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
 
 template <typename T>
-using SqrtMkldnnFunctor =
+using SqrtMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
 
 template <typename T>
-using AbsMkldnnFunctor =
+using AbsMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
 
 template <typename T>
-using ReluMkldnnGradFunctor =
+using ReluMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
 
 template <typename T>
-using TanhMkldnnGradFunctor =
+using TanhMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
 
 template <typename T>
-using SqrtMkldnnGradFunctor =
+using SqrtMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
 
 template <typename T>
-using AbsMkldnnGradFunctor =
+using AbsMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
 }  // namespace operators
 }  // namespace paddle
@@ -197,9 +265,9 @@ namespace ops = paddle::operators;
       ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
 
 #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)            \
-  __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
-  __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
-  __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
-  __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
+  __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
+  __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \
+  __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \
+  __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor);
 
 FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 55482abdf0..af1d85047e 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -24,12 +24,12 @@ namespace operators {
       : public ::paddle::framework::OpProtoAndCheckerMaker {            \
    public:                                                              \
     void Make() override {                                              \
-      AddInput("X", "Input of " #OP_NAME "operator");                   \
-      AddOutput("Out", "Output of" #OP_NAME "operator");                \
+      AddInput("X", "Input of " #OP_NAME " operator");                  \
+      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");   \
       AddAttr<bool>("use_mkldnn",                                       \
                     "(bool, default false) Only used in mkldnn kernel") \
           .SetDefault(false);                                           \
-      AddComment(#OP_COMMENT);                                          \
+      AddComment(OP_COMMENT);                                           \
     }                                                                   \
   }
 
@@ -41,7 +41,7 @@ namespace operators {
                                                                              \
    protected:                                                                \
     std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {    \
-      auto *op = new ::paddle::framework::OpDesc();                          \
+      auto* op = new ::paddle::framework::OpDesc();                          \
       op->SetType(#KERNEL_TYPE "_grad");                                     \
       op->SetInput("Out", Output("Out"));                                    \
       op->SetInput(::paddle::framework::GradVarName("Out"),                  \
@@ -54,23 +54,52 @@ namespace operators {
     }                                                                        \
   }
 
+framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
+                                      const framework::OperatorWithKernel& oper,
+                                      const std::string& name) {
+  framework::LibraryType library{framework::LibraryType::kPlain};
+
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+#ifdef PADDLE_WITH_MKLDNN
+  auto it = oper.Attrs().find("use_mkldnn");
+  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
+#endif
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
+      ctx.GetPlace(), layout, library);
+}
+
 class ActivationOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this, "X");
+  }
 };
 
 class ActivationOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this, "Out");
+  }
 };
 
 __attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC(
diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc
index 99b0239855..6ee73c3000 100644
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
@@ -89,9 +89,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
     AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
 
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("Moment1Out", "(Tensor) Output first moment");
-    AddOutput("Moment2Out", "(Tensor) Output second moment");
+    AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param");
+    AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1");
+    AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
new file mode 100644
index 0000000000..8174d37358
--- /dev/null
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp,
+                  paddle::operators::ArgMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    arg_max,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, size_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
new file mode 100644
index 0000000000..a147d77a9e
--- /dev/null
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    arg_max,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    size_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
new file mode 100644
index 0000000000..6cbdaefeda
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <type_traits>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace operators {
+
+enum ArgMinMaxType { kArgMin, kArgMax };
+
+template <typename DeviceContext, typename T, typename Tout, int64_t Rank,
+          ArgMinMaxType argMinMaxValue>
+struct ArgMinMaxFunctor {};
+
+#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value)      \
+  template <typename DeviceContext, typename T, typename Tout, int64_t Rank>  \
+  struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                       \
+                          enum_argminmax_value> {                             \
+    void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
+                    framework::LoDTensor* out, int64_t axis) {                \
+      auto in_eigen = framework::EigenTensor<T, Rank>::From(in);              \
+      auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(*out);    \
+      out_eigen.device(*(ctx.eigen_device())) =                               \
+          in_eigen.eigen_op_type(axis).template cast<Tout>();                 \
+    }                                                                         \
+  }
+
+DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin);
+DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax);
+
+template <typename DeviceContext, typename T, typename Tout,
+          ArgMinMaxType EnumArgMinMaxValue>
+class ArgMinMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& x = *(ctx.Input<framework::LoDTensor>("X"));
+    auto& out = *(ctx.Output<framework::LoDTensor>("Out"));
+    out.mutable_data<Tout>(ctx.GetPlace());
+    auto axis = ctx.Attr<int64_t>("axis");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+#define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
+  ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
+      functor##rank;                                                 \
+  functor##rank(dev_ctx, x, &out, axis)
+
+    switch (x.dims().size()) {
+      case 1:
+        CALL_ARG_MINMAX_FUNCTOR(1);
+        break;
+      case 2:
+        CALL_ARG_MINMAX_FUNCTOR(2);
+        break;
+      case 3:
+        CALL_ARG_MINMAX_FUNCTOR(3);
+        break;
+      case 4:
+        CALL_ARG_MINMAX_FUNCTOR(4);
+        break;
+      case 5:
+        CALL_ARG_MINMAX_FUNCTOR(5);
+        break;
+      case 6:
+        CALL_ARG_MINMAX_FUNCTOR(6);
+        break;
+      default:
+        PADDLE_THROW(
+            "%s operator doesn't supports tensors whose ranks are greater "
+            "than 6.",
+            (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
+        break;
+#undef CALL_ARG_MINMAX_FUNCTOR
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+using ArgMinKernel =
+    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMin>;
+
+template <typename DeviceContext, typename T>
+using ArgMaxKernel =
+    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMax>;
+
+class ArgMinMaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    const auto& x_dims = ctx->GetInputDim("X");
+    int64_t axis = ctx->Attrs().Get<int64_t>("axis");
+    PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(),
+                   "'axis' must be inside [-Rank(X), Rank(X))");
+
+    auto x_rank = x_dims.size();
+    if (axis < 0) axis += x_rank;
+
+    std::vector<int64_t> vec;
+    for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
+    for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
+    ctx->SetOutputDim("Out", framework::make_ddim(vec));
+  }
+};
+
+class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ protected:
+  virtual const char* OpName() const = 0;
+  virtual const char* Name() const = 0;
+
+ public:
+  void Make() override {
+    AddInput("X", "Input tensor.");
+    AddOutput("Out", "Output tensor.");
+    AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
+    AddComment(string::Sprintf(R"DOC(
+      %s Operator.
+
+      Computes the indices of the %s elements of the input tensor's element
+      along the provided axis.
+)DOC",
+                               OpName(), Name()));
+  }
+};
+
+class ArgMinOpMaker : public BaseArgMinMaxOpMaker {
+ protected:
+  const char* OpName() const override { return "ArgMin"; }
+  const char* Name() const override { return "min"; }
+};
+
+class ArgMaxOpMaker : public BaseArgMinMaxOpMaker {
+ protected:
+  const char* OpName() const override { return "ArgMax"; }
+  const char* Name() const override { return "max"; }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
new file mode 100644
index 0000000000..41f188029f
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp,
+                  paddle::operators::ArgMinOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    arg_min,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, size_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
new file mode 100644
index 0000000000..4d02050850
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_op.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    arg_min,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    size_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 6ec8c9d18b..92fbb9adaf 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel {
                       "Variance input should be of float type");
 
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
     }
 #endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                    library_);
   }
@@ -149,13 +151,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Variance",
              "The global variance (for training) "
              "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization");
+    AddOutput("Y", "result after normalization").Reuse("X");
     AddOutput("MeanOut",
               "Share memory with Mean. "
-              "Store the global mean when training");
+              "Store the global mean when training")
+        .Reuse("Mean");
     AddOutput("VarianceOut",
               "Share memory with Variance. "
-              "Store the global Variance when training");
+              "Store the global Variance when training")
+        .Reuse("Variance");
     AddOutput("SavedMean",
               "Mean of the current mini batch, "
               "will apply to output when training")
@@ -367,17 +371,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     }
 
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library_);
+        layout_, library_);
   }
 };
 
diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
index 483c9f8c21..fc15d56891 100644
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
 class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() final {
-    AddInput("Input",
-             "(Tensor) Tensor "
-             "whose input_dim_idx'th dimension specifies the batch_size");
+    AddInput(
+        "Input",
+        "Tensor whose input_dim_idx'th dimension specifies the batch_size");
     AddOutput("Out",
-              "(Tensor) Tensor of specified shape will be filled "
+              "Tensor of specified shape will be filled "
               "with the specified value");
-    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<std::vector<int>>("shape", "The shape of the output");
     AddAttr<int>("input_dim_idx",
-                 "(int, default 0) The index of input's batch size dimension")
+                 "default 0. The index of input's batch size dimension")
         .SetDefault(0);
     AddAttr<int>("output_dim_idx",
-                 "(int, default 0) The index of output's batch size dimension")
+                 "default 0. The index of output's batch size dimension")
         .SetDefault(0);
     Apply();
   }
diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc
index d46fda54e7..2572e813d6 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -34,22 +34,38 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
     int out_w = ctx->Attrs().Get<int>("out_w");
     PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
 
+    if (ctx->HasInput("OutSize")) {
+      auto out_size_dim = ctx->GetInputDim("OutSize");
+      PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
+                        "OutSize's dimension size must be 1");
+      PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+    }
     std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
     ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(Tensor) The input tensor of bilinear interpolation, "
+             "The input tensor of bilinear interpolation, "
              "This is a 4-D tensor with shape of (N x C x h x w)");
-    AddOutput("Out",
-              "(Tensor) The dimension of output is (N x C x out_h x out_w]");
+    AddInput("OutSize",
+             "This is a 1-D tensor with two number. "
+             "The first number is height and the second number is width.")
+        .AsDispensable();
+    AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");
 
-    AddAttr<int>("out_h", "(int) output height of bilinear interpolation op.");
-    AddAttr<int>("out_w", "(int) output width of bilinear interpolation op.");
+    AddAttr<int>("out_h", "output height of bilinear interpolation op.");
+    AddAttr<int>("out_w", "output width of bilinear interpolation op.");
     AddComment(R"DOC(
           Bilinear interpolation is an extension of linear interpolation for 
           interpolating functions of two variables (e.g. H-direction and 
@@ -78,6 +94,12 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu
index 510190f1aa..4c19715384 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
@@ -102,10 +102,21 @@ class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> {
     auto* input_t = ctx.Input<Tensor>("X");      // float tensor
     auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
     auto* input = input_t->data<T>();
-    auto* output = output_t->mutable_data<T>(ctx.GetPlace());
 
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+    auto out_dims = output_t->dims();
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
+
     int batch_size = input_t->dims()[0];
     int channels = input_t->dims()[1];
     int in_h = input_t->dims()[2];
@@ -139,8 +150,8 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
     auto* d_output = d_output_t->data<T>();
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
 
     auto& device_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
@@ -149,6 +160,16 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
 
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+
     int batch_size = d_input_t->dims()[0];
     int channels = d_input_t->dims()[1];
     int in_h = d_input_t->dims()[2];
diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h
index f6cd77e4d4..8b03cd5a06 100644
--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -24,11 +24,18 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input_t = ctx.Input<Tensor>("X");      // float tensor
     auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
+    auto out_dims = output_t->dims();
     auto* input = input_t->data<T>();
-    auto* output = output_t->mutable_data<T>(ctx.GetPlace());
-
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
     int batch_size = input_t->dims()[0];
     int channels = input_t->dims()[1];
     int in_h = input_t->dims()[2];
@@ -83,9 +90,8 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
     auto* d_output = d_output_t->data<T>();
-
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
     auto& device_ctx =
         ctx.template device_context<platform::CPUDeviceContext>();
     math::SetConstant<platform::CPUDeviceContext, T> zero;
@@ -93,6 +99,14 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
 
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+
     int batch_size = d_input_t->dims()[0];
     int channels = d_input_t->dims()[1];
     int in_h = d_input_t->dims()[2];
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index 84660d042c..8d6a498dc9 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -89,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, int>,
                        ops::CastOpKernel<CPU, int64_t>,
                        ops::CastOpKernel<CPU, bool>,
+                       ops::CastOpKernel<CPU, uint8_t>,
                        ops::CastOpKernel<CPU, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
index c486c5850e..657d162878 100644
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -21,5 +21,5 @@ using CastOpKernel =
 
 REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
                         CastOpKernel<int>, CastOpKernel<int64_t>,
-                        CastOpKernel<bool>,
+                        CastOpKernel<bool>, CastOpKernel<uint8_t>,
                         CastOpKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 7a7b8b76e4..1828be57b5 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-DEFINE_bool(cudnn_algo_use_autotune, true,
+DEFINE_bool(cudnn_deterministic, true,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
             "false, the algorithm is deterministic.");
@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      if (FLAGS_cudnn_algo_use_autotune) {
+      if (FLAGS_cudnn_deterministic) {
         PADDLE_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (FLAGS_cudnn_algo_use_autotune) {
+      if (FLAGS_cudnn_deterministic) {
         PADDLE_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 697d914842..27f1313116 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kPlain};
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library = framework::LibraryType::kCUDNN;
@@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
   if (library == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
   }
 #endif
 
@@ -99,9 +105,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                       "float16 can only be used when CUDNN is used");
   }
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                  library);
 }
@@ -122,7 +125,8 @@ void Conv2DOpMaker::Make() {
            "input image channels divided by the groups.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.");
+            "The format of output tensor is also NCHW.")
+      .Reuse("Input");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1}), the "
                             "strides(h_stride, w_stride) of "
@@ -217,7 +221,8 @@ void Conv3DOpMaker::Make() {
            "input image channels divided by the groups.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
-            "The format of output tensor is also NCDHW.");
+            "The format of output tensor is also NCDHW.")
+      .Reuse("Input");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default:{1, 1, 1}), the "
                             "strides(d_stride, h_stride, w_stride) of "
@@ -309,6 +314,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -318,12 +327,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout_, library_);
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 901682edbb..038ea89990 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -44,6 +44,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     const T* input_data = input->data<T>();
@@ -64,13 +65,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 
     // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output->dims()));
+        layout, framework::vectorize2int(output->dims()), groups);
     // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
@@ -104,11 +105,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 
     // ------------------- cudnn conv transpose forward ---------------------
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_offset = output->numel() / output->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-        handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
-        input_data, cudnn_conv_desc, algo, cudnn_workspace,
-        workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+    for (int g = 0; g < groups; g++) {
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
+    }
 
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
@@ -134,6 +141,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     // ------------------- cudnn descriptors ---------------------
@@ -145,13 +153,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
     // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output_grad->dims()));
+        layout, framework::vectorize2int(output_grad->dims()), groups);
     // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
 
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
@@ -205,15 +213,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_grad_offset =
+        output_grad->numel() / output_grad->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_output_desc, output_grad_data,
-          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
-          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-          input_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
+      }
     }
 
     // ------------------- cudnn conv backward filter ---------------------
@@ -221,11 +236,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
-          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
-          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + filter_offset * g));
+      }
     }
+
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
   }
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index c27c8e2731..0b363f5c43 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -32,6 +32,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
+  int groups = ctx->Attrs().Get<int>("groups");
 
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "ConvTransposeOp intput should be 4-D or 5-D tensor.");
@@ -48,10 +49,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                     "ConvTransposeOp paddings dimension and dilations "
                     "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "In ConvTransposeOp, The input channel should be the same "
-                    "as the number of filters.");
+                    "In ConvTransposeOp, The number of input channels should "
+                    "be equal to the number of filter's channels.");
 
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
   for (size_t i = 0; i < strides.size(); ++i) {
     auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
     output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
@@ -102,7 +103,10 @@ void Conv2DTransposeOpMaker::Make() {
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
-
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<std::vector<int>>("dilations",
                             "(vector<int> default:{1, 1}), the "
                             "dilations(h_dilation, w_dilation) of convolution "
@@ -204,6 +208,10 @@ void Conv3DTransposeOpMaker::Make() {
                             "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
                             "h_pad, w_pad) of convolution transpose operator.")
       .SetDefault({0, 0, 0});
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution3d "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<bool>(
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
index f9d205a5b5..1dcfc651fd 100644
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -70,7 +70,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    // groups will alway be disabled in conv2dtranspose.
+    int groups = context.Attr<int>("groups");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
@@ -81,10 +81,10 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
 
     // use col_shape in the im2col and col2im (or vol2col and col2vol)
     // calculation
-    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    // col_shape_vec: {c/g, k_h, k_w, h, w} or {c/g, k_d, k_h, k_w, d, h, w}
     size_t data_dim = filter_shape_vec.size() - 2;
     std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = output->dims()[1];
+    col_shape_vec[0] = output->dims()[1] / groups;
     for (size_t j = 0; j < data_dim; ++j) {
       col_shape_vec[j + 1] = filter_shape_vec[j + 2];
       col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
@@ -92,7 +92,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     DDim col_shape(framework::make_ddim(col_shape_vec));
 
     // use col_matrix_shape in the gemm calculation
-    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    // size: (c/g * k_h * k_w, h * w) or (c/g * k_d * k_h * k_w, d * h * w)
     DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
 
     Tensor col;
@@ -111,7 +111,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
 
-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
     DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
     filter.Resize(filter_matrix_shape);
 
@@ -121,6 +121,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     set_zero(dev_ctx, output, static_cast<T>(0));
 
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
     math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
     math::Col2VolFunctor<DeviceContext, T> col2vol;
 
@@ -133,22 +135,29 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
       // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
       Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
 
-      // col_matrix = filter * input_batch
-      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-      blas.MatMul(filter, true, input_batch, false, static_cast<T>(1.0),
-                  &col_matrix, static_cast<T>(0.0));
-
-      if (data_dim == 2U) {
-        // col2im: col_matrix -> dy
-        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
-        col2im(dev_ctx, col, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &output_batch);
-      } else if (data_dim == 3U) {
-        // col2vol: col_matrix -> dy
-        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch);
+      for (int g = 0; g < groups; g++) {
+        Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step);
+        Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+        Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
+
+        // col_matrix = filter_slice * input_slice
+        // of shape (c/g * k_h * k_w, h * w)
+        // or (c/g * k_d * k_h * k_w, d * h * w)
+        blas.MatMul(filter_slice, true, in_slice, false, static_cast<T>(1.0),
+                    &col_matrix, static_cast<T>(0.0));
+
+        if (data_dim == 2U) {
+          // col2im: col_matrix -> dy
+          // from (c/g * k_h * k_w, h * w) to (c/g, o_h, o_w)
+          col2im(dev_ctx, col, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &out_slice);
+        } else if (data_dim == 3U) {
+          // col2vol: col_matrix -> dy
+          // from (c/g * k_d * k_h * k_w, d * h * w) to (c/g, o_d, o_h, o_w)
+          col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice);
+        }
       }
     }
   }
@@ -174,6 +183,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    int groups = context.Attr<int>("groups");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
@@ -205,9 +215,11 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
 
-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
-    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0] / groups};
     filter.Resize(filter_matrix_shape);
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
 
     // convolution transpose grad on input:
     // im2col + gemm (similar to conv-forward)
@@ -233,7 +245,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
       if (input_grad) {
         input_grad->mutable_data<T>(context.GetPlace());
       }
-      if (filter_grad) {  // filter size (m, c, k_h, k_w)
+      if (filter_grad) {  // filter size (m, c/g, k_h, k_w)
         filter_grad->mutable_data<T>(context.GetPlace());
         set_zero(dev_ctx, filter_grad, static_cast<T>(0));
         filter_grad_ = *filter_grad;
@@ -268,8 +280,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
           // d, h, w)
-          blas.MatMul(filter, false, col_matrix, false, static_cast<T>(1.0),
-                      &input_grad_batch, static_cast<T>(0.0));
+          for (int g = 0; g < groups; g++) {
+            Tensor input_grad_slice =
+                input_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+            Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+            Tensor col_matrix_slice =
+                col_matrix.Slice(g * col_step, (g + 1) * col_step);
+
+            blas.MatMul(filter_slice, false, col_matrix_slice, false,
+                        static_cast<T>(1.0), &input_grad_slice,
+                        static_cast<T>(0.0));
+          }
         }
         if (filter_grad) {
           // input batch
@@ -279,8 +300,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
           // k_h * k_w)
-          blas.MatMul(in_batch, false, col_matrix, true, static_cast<T>(1.0),
-                      &filter_grad_, static_cast<T>(1.0));
+          for (int g = 0; g < groups; g++) {
+            Tensor in_batch_slice =
+                in_batch.Slice(g * in_step, (g + 1) * in_step);
+            Tensor filter_grad_slice =
+                filter_grad_.Slice(g * in_step, (g + 1) * in_step);
+            Tensor col_matrix_slice =
+                col_matrix.Slice(g * col_step, (g + 1) * col_step);
+            blas.MatMul(in_batch_slice, false, col_matrix_slice, true,
+                        static_cast<T>(1.0), &filter_grad_slice,
+                        static_cast<T>(1.0));
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index 669b3bbe9d..5b5a220cf9 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim("Out", y_dim);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -60,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input used as reference for cropping, "
              "which is of the same dimensions as X.")
         .AsDispensable();
+    AddInput("Offsets",
+             "The input used to describe offsets in runtime, which is a "
+             "1-D vector whose size equals to the rank of input 'X'. The "
+             "elements data type must be int.")
+        .AsDispensable();
     AddOutput("Out",
               "The output of crop op, "
               "which is of the same dimensions as X.");
     AddAttr<std::vector<int>>("offsets",
                               "A list<int> describing offsets to be cropped. "
                               "The size of offsets list should be the same as "
-                              "the dimension size of input X.");
+                              "the dimension size of input X.")
+        .SetDefault(std::vector<int>());
     AddAttr<std::vector<int>>("shape",
                               "A list<int> describing the shape of output. "
                               "The size of shape list should be the same as "
@@ -77,6 +90,17 @@ Crop Operator.
 
 Crop input into output, as specified by offsets and shape.
 
+There are two ways to set the offsets:
+1. In runtime: Using the input 'Offsets', which is a Vairbale and can be 
+               output of other operators. This way is suitable for 
+               dynamic offsets.
+2. In network configuration: Using the attribute 'offsets', which will be 
+                             set in Python configure script. This way is 
+                             suitable for fixed offsets.
+You CANNOT use these two ways at the same time. An exception will be raised 
+if input 'Offset' is configured and meanwhile the attribute 'offsets' is 
+not empty.
+
 There are two ways to set shape:
 1. reference input: crop input X into the same shape as reference input.
                     The dimension of reference input should
@@ -146,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index f05c2e2328..91cfbbda73 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -27,6 +27,37 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 
+static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
+  std::vector<int> res;
+  int rank = ctx.Input<Tensor>("X")->dims().size();
+  if (ctx.HasInput("Offsets")) {
+    PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
+                   "Input 'Offsets' and attribute 'offsets' should not be used "
+                   "at the same time.");
+    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
+    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
+    PADDLE_ENFORCE_EQ(
+        rank, offsets_tensor->dims()[0],
+        "Offsets size should be equal to dimension size of input tensor.");
+    const int* offsets_data;
+    framework::Tensor cpu_tmp_tensor;
+    if (platform::is_cpu_place(offsets_tensor->place())) {
+      offsets_data = offsets_tensor->data<int>();
+    } else {
+      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
+                                &cpu_tmp_tensor);
+      offsets_data = cpu_tmp_tensor.data<int>();
+    }
+    res = std::vector<int>(offsets_data, offsets_data + rank);
+  } else {
+    res = ctx.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        rank, res.size(),
+        "Offsets size should be equal to dimension size of input tensor.");
+  }
+  return res;
+}
+
 template <typename T>
 class CropKernel : public framework::OpKernel<T> {
  public:
@@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto x_stride = framework::stride(x->dims());
     auto out_stride = framework::stride(out->dims());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        x->dims().size(), static_cast<int64_t>(offsets.size()),
-        "Offsets size should be equal to dimension size of input tensor.");
+    auto offsets = GetOffsets(context);
     int64_t offset = 0;
     for (size_t i = 0; i < offsets.size(); ++i) {
       offset += (x_stride[i] * offsets[i]);
@@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
   if (d_x != nullptr) {
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
     d_x->mutable_data<T>(context.GetPlace());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
+    auto offsets = GetOffsets(context);
     Eigen::array<std::pair<int, int>, D> paddings;
     for (size_t i = 0; i < D; ++i) {
       paddings[i].first = offsets[i];
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index a3bec3da45..d5e095f9ca 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -124,7 +124,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
              "Tensor<float/double> with shape [N x D].");
     AddOutput("Y",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape "
-              "[N x 1]. The cross entropy loss.");
+              "[N x 1]. The cross entropy loss.")
+        .Reuse("X");
     AddAttr<bool>("soft_label",
                   "(bool, default false), a flag indicating whether to "
                   "interpretate the given labels as soft labels.")
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index 719a7465b8..abc5aad043 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -1,9 +1,38 @@
-if(WITH_DISTRIBUTE)
+if(NOT WITH_DISTRIBUTE)
+    return()
+endif()
+
+
+if(WITH_GRPC)
   grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
+      selected_rows memory)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
-      cares zlib protobuf sendrecvop_grpc)
-  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
+  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
+          cares zlib protobuf sendrecvop_grpc SERIAL)
+  cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
+          grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+          proto_desc lookup_table_op SERIAL)
+  return()
 endif()
+
+
+set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc  rpc_client.cc request_handler_impl.cc
+  PROTO send_recv.proto
+  DEPS lod_tensor selected_rows memory)
+
+find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so)
+ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC})
+
+
+find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so)
+ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC})
+
+cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc 
+       brpc protobuf leveldb gflags glog
+       protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL)
diff --git a/paddle/fluid/operators/detail/brpc_client.cc b/paddle/fluid/operators/detail/brpc_client.cc
new file mode 100644
index 0000000000..9a4e410f1d
--- /dev/null
+++ b/paddle/fluid/operators/detail/brpc_client.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/brpc_client.h"
+#include "paddle/fluid/framework/threadpool.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+DEFINE_int32(brpc_channel_num, 24,
+             "Number of channels to send requests connected to one server");
+DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds");
+DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
+
+BRPCClient::~BRPCClient() { Wait(); }
+
+void HandleSendResponse(brpc::Controller* cntl,
+                        sendrecv::VoidMessage* response) {
+  // std::unique_ptr makes sure cntl/response will be deleted before returning.
+  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
+  std::unique_ptr<sendrecv::VoidMessage> response_guard(response);
+
+  if (cntl->Failed()) {
+    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
+    return;
+  }
+  LOG(INFO) << "Received response from " << cntl->remote_side()
+            << " latency=" << cntl->latency_us() << "us";
+}
+
+bool BRPCClient::AsyncSendVar(const std::string& ep,
+                              const platform::DeviceContext& ctx,
+                              const framework::Scope& scope,
+                              const std::string& var_name, int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch_ptr = GetChannel(ep_val);
+
+  framework::AsyncIO(
+      [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] {
+        auto ch_ctx = ch_ptr->Pop();
+        brpc::Controller* cntl = new brpc::Controller();
+        sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
+        cntl->set_timeout_ms(time_out);
+
+        google::protobuf::Closure* done =
+            brpc::NewCallback(&HandleSendResponse, cntl, response);
+
+        sendrecv::VariableMessage request;
+        ch_ctx->stub->SendVariable(cntl, &request, response, done);
+      });
+  req_count_++;
+
+  return true;
+}
+
+void HandleGetResponse(brpc::Controller* cntl,
+                       sendrecv::VariableMessage* response) {
+  // std::unique_ptr makes sure cntl/response will be deleted before returning.
+  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
+  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);
+
+  if (cntl->Failed()) {
+    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
+    return;
+  }
+  LOG(INFO) << "Received response from " << cntl->remote_side()
+            << " latency=" << cntl->latency_us() << "us";
+
+  // framework::Variable* outvar = nullptr;
+  // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
+}
+
+bool BRPCClient::AsyncGetVar(const std::string& ep,
+                             const platform::DeviceContext& ctx,
+                             const framework::Scope& scope,
+                             const std::string& var_name, int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::AsyncIO(
+      [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {});
+
+  req_count_++;
+
+  return true;
+}
+
+bool BRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& in_var_name,
+                                  const std::string& out_var_name,
+                                  int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string in_var_name_val = in_var_name;
+  const std::string out_var_name_val = out_var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
+                      time_out, ch, this] {});
+
+  req_count_++;
+  return true;
+}
+
+void BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                       int64_t time_out) {
+  req_count_++;
+}
+
+void BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                       int64_t time_out) {
+  req_count_++;
+}
+
+void BRPCClient::Wait() {
+  std::unique_lock<std::mutex> lk(sync_mutex_);
+  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
+}
+
+ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
+  {
+    std::lock_guard<std::mutex> guard(chan_mutex_);
+    auto it = channels_.find(ep);
+    if (it != channels_.end()) {
+      return it->second;
+    }
+  }
+
+  ChannelQueuePtr q(new framework::BlockingQueue<ChannelContextPtr>());
+
+  brpc::ChannelOptions options;
+  options.protocol = "baidu_std";
+  options.connection_type = "pooled";
+  options.connect_timeout_ms = 100;
+  options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/;
+  options.max_retry = FLAGS_max_retry;
+  for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
+    std::shared_ptr<ChannelContext> c(new ChannelContext());
+    if (c->channel.Init(ep.c_str(), &options) != 0) {
+      LOG(ERROR) << "Fail to initialize channel";
+      return nullptr;
+    }
+
+    c->stub.reset(new sendrecv::SendRecvService_Stub(
+        static_cast<google::protobuf::RpcChannel*>(&c->channel)));
+    q->Push(c);
+  }
+
+  {
+    std::lock_guard<std::mutex> guard(chan_mutex_);
+    channels_[ep] = q;
+  }
+
+  return q;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/brpc_client.h b/paddle/fluid/operators/detail/brpc_client.h
new file mode 100644
index 0000000000..1e953ea431
--- /dev/null
+++ b/paddle/fluid/operators/detail/brpc_client.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <time.h>
+
+#include <chrono>  // NOLINT
+#include <ctime>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <mutex>  // NOLINT
+#include <string>
+#include <vector>
+
+#include "brpc/channel.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+struct ChannelContext {
+  brpc::Channel channel;
+  std::shared_ptr<sendrecv::SendRecvService_Stub> stub;
+};
+
+typedef std::shared_ptr<ChannelContext> ChannelContextPtr;
+typedef std::shared_ptr<framework::BlockingQueue<ChannelContextPtr>>
+    ChannelQueuePtr;
+
+class BRPCClient : public RPCClient {
+ public:
+  BRPCClient() {}
+  virtual ~BRPCClient();
+
+  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+                    const framework::Scope& scope, const std::string& var_name,
+                    int64_t time_out = RPCClient::rpc_time_out) override;
+
+  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
+                   const framework::Scope& scope, const std::string& var_name,
+                   int64_t time_out = RPCClient::rpc_time_out) override;
+
+  bool AsyncPrefetchVar(const std::string& ep,
+                        const platform::DeviceContext& ctx,
+                        const framework::Scope& scope,
+                        const std::string& in_var_name,
+                        const std::string& out_var_name,
+                        int64_t time_out = RPCClient::rpc_time_out) override;
+
+  void AsyncSendBatchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
+
+  void AsyncSendFetchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
+
+  void Wait() override;
+
+ private:
+  void Proceed();
+  ChannelQueuePtr GetChannel(const std::string& ep);
+
+ private:
+  std::unordered_map<std::string, ChannelQueuePtr> channels_;
+
+  // mutex for Wait client sync
+  std::mutex sync_mutex_;
+  std::condition_variable sync_cond_;
+  std::atomic<int64_t> req_count_{0};
+
+  // mutex for GetChannel thread safety
+  std::mutex chan_mutex_;
+  DISABLE_COPY_AND_ASSIGN(BRPCClient);
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/brpc_server.cc b/paddle/fluid/operators/detail/brpc_server.cc
new file mode 100644
index 0000000000..2170abe679
--- /dev/null
+++ b/paddle/fluid/operators/detail/brpc_server.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/brpc_server.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+
+namespace sendrecv {
+
+typedef std::unordered_map<std::string,
+                           paddle::operators::detail::RequestHandler*>
+    HandlerMap;
+
+class BRPCServiceImpl : public SendRecvService {
+ public:
+  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map)
+      : request_send_h_(nullptr),
+        request_get_h_(nullptr),
+        request_prefetch_h_(nullptr) {
+    auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
+    if (it != rpc_call_map.end()) {
+      request_send_h_ = it->second;
+    }
+
+    it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
+    if (it != rpc_call_map.end()) {
+      request_get_h_ = it->second;
+    }
+
+    it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch);
+    if (it != rpc_call_map.end()) {
+      request_prefetch_h_ = it->second;
+    }
+  }
+
+  virtual ~BRPCServiceImpl() {}
+
+  void SendVariable(google::protobuf::RpcController* cntl_butil,
+                    const VariableMessage* request, VoidMessage* response,
+                    google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(request_send_h_ != nullptr,
+                   "RequestSend handler should be registed first!");
+    brpc::ClosureGuard done_guard(done);
+
+    paddle::framework::Scope* local_scope = request_send_h_->scope();
+    paddle::framework::Variable* outvar = nullptr;
+    paddle::framework::Variable* invar = nullptr;
+
+    std::string varname = request->varname();
+
+    if (!request_send_h_->sync_mode()) {
+      local_scope = &request_send_h_->scope()->NewScope();
+      invar = local_scope->Var(varname);
+    } else {
+      invar = local_scope->FindVar(varname);
+    }
+
+    request_send_h_->Handle(varname, local_scope, invar, &outvar);
+
+    if (!request_send_h_->sync_mode()) {
+      request_send_h_->scope()->DeleteScope(local_scope);
+    }
+  }
+
+  void GetVariable(google::protobuf::RpcController* cntl_butil,
+                   const VariableMessage* request, VariableMessage* response,
+                   google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(request_get_h_ != nullptr,
+                   "RequestGet handler should be registed first!");
+  }
+
+  void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
+                        const VariableMessage* request,
+                        VariableMessage* response,
+                        google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
+                   "kRequestPrefetch handler should be registed first!");
+  }
+
+ private:
+  paddle::operators::detail::RequestHandler* request_send_h_;
+  paddle::operators::detail::RequestHandler* request_get_h_;
+  paddle::operators::detail::RequestHandler* request_prefetch_h_;
+};
+}  // namespace sendrecv
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+void AsyncBRPCServer::StartServer() {
+  // Instance of your service.
+  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_);
+
+  // Add the service into server. Notice the second parameter, because the
+  // service is put on stack, we don't want server to delete it, otherwise
+  // use brpc::SERVER_OWNS_SERVICE.
+  if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
+    LOG(FATAL) << "Fail to add service";
+    return;
+  }
+
+  brpc::ServerOptions options;
+  options.idle_timeout_sec = idle_timeout_s_;
+  options.max_concurrency = max_concurrency_;
+  if (server_.Start(bind_address_.c_str(), &options) != 0) {
+    LOG(FATAL) << "Fail to start EchoServer" << bind_address_;
+    return;
+  }
+
+  butil::EndPoint ep = server_.listen_address();
+  selected_port_ = ep.port;
+
+  {
+    std::lock_guard<std::mutex> lock(this->mutex_ready_);
+    ready_ = 1;
+  }
+  condition_ready_.notify_all();
+
+  server_.Join();
+}
+
+void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }
+
+void AsyncBRPCServer::WaitServerReady() {
+  VLOG(3) << "AsyncGRPCServer is wait server ready";
+  std::unique_lock<std::mutex> lock(this->mutex_ready_);
+  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
+  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+}
+
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/detail/brpc_server.h b/paddle/fluid/operators/detail/brpc_server.h
new file mode 100644
index 0000000000..0105c8074a
--- /dev/null
+++ b/paddle/fluid/operators/detail/brpc_server.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <mutex>               // NOLINT
+#include <string>
+
+#include "brpc/server.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/detail/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class AsyncBRPCServer final : public RPCServer {
+ public:
+  explicit AsyncBRPCServer(const std::string& address, int client_num)
+      : RPCServer(address, client_num), ready_(0) {}
+
+  virtual ~AsyncBRPCServer() {}
+  void StartServer() override;
+  void WaitServerReady() override;
+
+ private:
+  void ShutDownImpl() override;
+
+  brpc::Server server_;
+
+  static constexpr int idle_timeout_s_ = -1;
+  static constexpr int max_concurrency_ = 0;
+
+  std::mutex mutex_ready_;
+  std::condition_variable condition_ready_;
+  int ready_;
+};
+
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index ae60ab1532..02ffe3651e 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -19,16 +19,43 @@ limitations under the License. */
 #include <limits>
 
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 namespace detail {
 
-bool RPCClient::AsyncSendVariable(const std::string& ep,
-                                  const platform::DeviceContext& ctx,
-                                  const framework::Scope& scope,
-                                  const std::string& var_name,
-                                  int64_t time_out) {
+void GRPCClient::InitImpl() { InitEventLoop(); }
+
+void GRPCClient::InitEventLoop() {
+  // start the client process thread
+  // TODO(wuyi): can make this in a threadpool
+  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
+}
+
+void GRPCClient::SendComplete() {
+  for (auto& it : channels_) {
+    this->AsyncSendComplete(it.first);
+  }
+}
+
+GRPCClient::~GRPCClient() {
+  Wait();
+  cq_.Shutdown();
+  {
+    std::lock_guard<std::mutex> guard(chan_mutex_);
+    for (auto& it : channels_) {
+      it.second.reset();
+    }
+  }
+  client_thread_->join();
+}
+
+bool GRPCClient::AsyncSendVar(const std::string& ep,
+                              const platform::DeviceContext& ctx,
+                              const framework::Scope& scope,
+                              const std::string& var_name, int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -59,7 +86,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   });
-
   req_count_++;
 
   return true;
@@ -79,11 +105,10 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
   result->Swap(&tmp);
 }
 
-bool RPCClient::AsyncGetVariable(const std::string& ep,
-                                 const platform::DeviceContext& ctx,
-                                 const framework::Scope& scope,
-                                 const std::string& var_name,
-                                 int64_t time_out) {
+bool GRPCClient::AsyncGetVar(const std::string& ep,
+                             const platform::DeviceContext& ctx,
+                             const framework::Scope& scope,
+                             const std::string& var_name, int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -121,12 +146,12 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
   return true;
 }
 
-bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
-                                      const std::string& in_var_name,
-                                      const std::string& out_var_name,
-                                      int64_t time_out) {
+bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& in_var_name,
+                                  const std::string& out_var_name,
+                                  int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string in_var_name_val = in_var_name;
@@ -164,7 +189,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
   return true;
 }
 
-void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                       int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
@@ -177,7 +203,8 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
   req_count_++;
 }
 
-void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                       int64_t time_out) {
   const auto ch = GetChannel(ep);
   FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
   s->Prepare(time_out);
@@ -189,62 +216,50 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
   req_count_++;
 }
 
-bool RPCClient::Wait() {
-  if (req_count_ <= 0) {
-    return true;
-  }
-  const size_t kReqCnt = req_count_;
-  bool a[kReqCnt];
-  std::vector<std::future<void>> waits(req_count_);
-
-  for (int i = 0; i < req_count_; i++) {
-    waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
-  }
-
-  for (int i = 0; i < req_count_; i++) {
-    waits[i].wait();
-  }
+void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
 
-  int last_req_count = req_count_;
-  req_count_ = 0;
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
 
-  for (int i = 0; i < last_req_count; i++) {
-    if (!a[i]) {
-      return false;
-    }
-  }
+  sendrecv::VariableMessage req;
+  req.set_varname(COMPLETE_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
 
-  return true;
+void GRPCClient::Wait() {
+  std::unique_lock<std::mutex> lk(sync_mutex_);
+  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
 }
 
-bool RPCClient::Proceed() {
-  void* tag = NULL;
+void GRPCClient::Proceed() {
+  void* tag = nullptr;
   bool ok = false;
 
-  // request counts.
-  if (!cq_.Next(&tag, &ok)) {
-    LOG(ERROR) << "Get meets CompletionQueue error";
-    return false;
-  }
-
-  GPR_ASSERT(ok);
-  PADDLE_ENFORCE(tag);
-
-  // TODO(gongwb): add more retries.
-  BaseProcessor* c = static_cast<BaseProcessor*>(tag);
-  if (!c->status_.ok()) {
-    LOG(ERROR) << "proc param error:" << c->var_h_.String()
-               << " grpc error:" << c->status_.error_message();
+  while (cq_.Next(&tag, &ok)) {
+    BaseProcessor* c = static_cast<BaseProcessor*>(tag);
+    GPR_ASSERT(ok);
+    PADDLE_ENFORCE(c);
+    if (c->status_.ok()) {
+      c->Process();
+    } else {
+      LOG(ERROR) << "var: " << c->var_h_.String()
+                 << " grpc error:" << c->status_.error_message();
+    }
     delete c;
-    return false;
+    {
+      std::lock_guard<std::mutex> lk(sync_mutex_);
+      req_count_--;
+    }
+    sync_cond_.notify_all();
   }
-
-  c->Process();
-  delete c;
-  return true;
 }
 
-std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
+std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
+  // TODO(Yancey1989): make grpc client completely thread-safe
+  std::lock_guard<std::mutex> guard(chan_mutex_);
   auto it = channels_.find(ep);
   if (it != channels_.end()) {
     return it->second;
@@ -257,7 +272,6 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
 
   auto ch =
       grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
-
   channels_[ep] = ch;
   return ch;
 }
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
index dabce7414d..44000c028b 100644
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -16,14 +16,18 @@ limitations under the License. */
 
 #include <time.h>
 
-#include <chrono>  // NOLINT
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
 #include <ctime>
 #include <functional>
 #include <iostream>
 #include <map>
+#include <mutex>  // NOLINT
 #include <string>
+#include <thread>  // NOLINT
 #include <vector>
 
+#include "grpc++/channel.h"
 #include "grpc++/generic/generic_stub.h"
 #include "grpc++/grpc++.h"
 #include "grpc++/support/byte_buffer.h"
@@ -34,7 +38,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
 namespace operators {
@@ -159,43 +165,65 @@ class FetchBarrierProcessor : public BaseProcessor {
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 
-class RPCClient {
+class GRPCClient : public RPCClient {
  public:
-  bool AsyncSendVariable(const std::string& ep,
-                         const platform::DeviceContext& ctx,
-                         const framework::Scope& scope,
-                         const std::string& var_name,
-                         int64_t time_out = 600 * 1000);
+  GRPCClient() {}
+  virtual ~GRPCClient();
 
-  bool AsyncGetVariable(const std::string& ep,
+  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+                    const framework::Scope& scope, const std::string& var_name,
+                    int64_t time_out = RPCClient::rpc_time_out) override;
+
+  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
+                   const framework::Scope& scope, const std::string& var_name,
+                   int64_t time_out = RPCClient::rpc_time_out) override;
+
+  bool AsyncPrefetchVar(const std::string& ep,
                         const platform::DeviceContext& ctx,
                         const framework::Scope& scope,
-                        const std::string& var_name,
-                        int64_t time_out = 600 * 1000);
+                        const std::string& in_var_name,
+                        const std::string& out_var_name,
+                        int64_t time_out = RPCClient::rpc_time_out) override;
 
-  bool AsyncPrefetchVariable(const std::string& ep,
-                             const platform::DeviceContext& ctx,
-                             const framework::Scope& scope,
-                             const std::string& in_var_name,
-                             const std::string& out_var_name,
-                             int64_t time_out = 600 * 1000);
+  void AsyncSendBatchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
 
-  void AsyncSendBatchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
+  void AsyncSendFetchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
 
-  void AsyncSendFetchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
+  void Wait() override;
 
-  bool Wait();
+  void SendComplete() override;
+
+ protected:
+  void InitImpl() override;
 
  private:
-  bool Proceed();
+  // InitEventLoop should only be called by Init()
+  void InitEventLoop();
+
+  void Proceed();
+
+  void AsyncSendComplete(const std::string& ep,
+                         int64_t time_out = RPCClient::rpc_time_out);
+
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
 
  private:
   grpc::CompletionQueue cq_;
-  std::map<std::string, std::shared_ptr<grpc::Channel>> channels_;
-  int64_t req_count_ = 0;
+  std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
+  std::unique_ptr<std::thread> client_thread_;
+
+  // mutex for Wait client sync
+  std::mutex sync_mutex_;
+  std::condition_variable sync_cond_;
+  std::atomic<int64_t> req_count_{0};
+
+  // mutex for GetChannel thread safety
+  std::mutex chan_mutex_;
+  DISABLE_COPY_AND_ASSIGN(GRPCClient);
 };
 
 }  // namespace detail
diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/grpc_serde_test.cc
similarity index 100%
rename from paddle/fluid/operators/detail/serde_test.cc
rename to paddle/fluid/operators/detail/grpc_serde_test.cc
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index eb114a47d9..2d34f85838 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/detail/grpc_server.h"
-
 #include <limits>
 #include <string>
 
+#include "paddle/fluid/operators/detail/grpc_server.h"
+
 using ::grpc::ServerAsyncResponseWriter;
 
 namespace paddle {
 namespace operators {
 namespace detail {
-
 enum CallStatus { PROCESS = 0, FINISH };
 
 // reference:
@@ -30,335 +29,322 @@ enum CallStatus { PROCESS = 0, FINISH };
 class RequestBase {
  public:
   explicit RequestBase(GrpcService::AsyncService* service,
-                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                       const platform::DeviceContext* dev_ctx)
+                       ::grpc::ServerCompletionQueue* cq,
+                       RequestHandler* request_handler, int req_id)
       : service_(service),
         cq_(cq),
-        sync_mode_(sync_mode),
         status_(PROCESS),
-        dev_ctx_(dev_ctx) {
+        request_handler_(request_handler),
+        req_id_(req_id) {
     PADDLE_ENFORCE(cq_);
   }
   virtual ~RequestBase() {}
-  virtual void Process() { assert(false); }
+  virtual void Process() = 0;
+
+  CallStatus Status() const {
+    std::lock_guard<std::mutex> l(status_mu_);
+    return status_;
+  }
 
-  CallStatus Status() { return status_; }
-  void SetStatus(CallStatus status) { status_ = status; }
-  virtual std::string GetReqName() {
-    assert(false);
-    return "";
+  template <typename T>
+  void Finish(const T& reply, ServerAsyncResponseWriter<T>* responder) {
+    std::lock_guard<std::mutex> l(status_mu_);
+    status_ = FINISH;
+    responder->Finish(reply, ::grpc::Status::OK,
+                      reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
   }
+  virtual std::string GetReqName() = 0;
 
  protected:
+  mutable std::mutex status_mu_;
   ::grpc::ServerContext ctx_;
   GrpcService::AsyncService* service_;
   ::grpc::ServerCompletionQueue* cq_;
-  const bool sync_mode_;
   CallStatus status_;
-  const platform::DeviceContext* dev_ctx_;
+  RequestHandler* request_handler_;
+  int req_id_;
 };
 
 class RequestSend final : public RequestBase {
  public:
   explicit RequestSend(GrpcService::AsyncService* service,
-                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                       framework::Scope* scope, ReceivedQueue* queue,
-                       const platform::DeviceContext* dev_ctx)
-      : RequestBase(service, cq, sync_mode, dev_ctx),
-        queue_(queue),
-        responder_(&ctx_) {
-    if (sync_mode_) {
-      request_.reset(new VariableResponse(scope, dev_ctx_, false));
-    } else {
-      request_.reset(new VariableResponse(scope, dev_ctx_, true));
-    }
+                       ::grpc::ServerCompletionQueue* cq,
+                       RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    request_.reset(new VariableResponse(request_handler->scope(),
+                                        request_handler->dev_ctx(),
+                                        !request_handler->sync_mode()));
     int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
-                                cq_, cq_, this);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
   }
-
   virtual ~RequestSend() {}
+  std::string GetReqName() override { return request_->Varname(); }
 
-  virtual std::string GetReqName() { return request_->Varname(); }
+  void Process() override {
+    std::string varname = GetReqName();
+    VLOG(3) << "RequestSend var_name:" << varname;
 
-  virtual void Process() {
-    std::string var_name = GetReqName();
-    VLOG(3) << "RequestSend " << var_name;
-    queue_->Push(std::make_pair(var_name, request_));
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = request_->GetVar();
+    framework::Variable* outvar = nullptr;
 
-    sendrecv::VoidMessage reply;
-    responder_.Finish(reply, ::grpc::Status::OK, this);
-    status_ = FINISH;
+    request_handler_->Handle(varname, scope, invar, &outvar);
+    Finish(reply_, &responder_);
   }
 
  protected:
+  sendrecv::VoidMessage reply_;
   std::shared_ptr<VariableResponse> request_;
-  ReceivedQueue* queue_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
 
 class RequestGet final : public RequestBase {
  public:
   explicit RequestGet(GrpcService::AsyncService* service,
-                      ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                      framework::Scope* scope,
-                      const platform::DeviceContext* dev_ctx,
-                      framework::BlockingQueue<MessageWithName>* queue)
-      : RequestBase(service, cq, sync_mode, dev_ctx),
-        responder_(&ctx_),
-        scope_(scope),
-        queue_(queue) {
+                      ::grpc::ServerCompletionQueue* cq,
+                      RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
     auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
-                                cq_, this);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
   }
 
   virtual ~RequestGet() {}
 
-  virtual std::string GetReqName() { return request_.varname(); }
+  std::string GetReqName() override { return request_.varname(); }
 
-  virtual void Process() {
+  void Process() override {
     // proc request.
-    std::string var_name = request_.varname();
-    VLOG(3) << "RequestGet " << var_name;
-    auto* var = scope_->FindVar(var_name);
+    std::string varname = request_.varname();
+    VLOG(3) << "RequestGet " << varname;
 
-    ::grpc::ByteBuffer reply;
-    if (var_name != FETCH_BARRIER_MESSAGE) {
-      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
-    }
+    auto scope = request_handler_->scope();
+    auto invar = scope->FindVar(varname);
+    framework::Variable* outvar = nullptr;
 
-    responder_.Finish(reply, ::grpc::Status::OK, this);
-    status_ = FINISH;
+    request_handler_->Handle(varname, scope, invar, &outvar);
 
-    if (var_name == FETCH_BARRIER_MESSAGE) {
-      sendrecv::VariableMessage msg;
-      MessageWithName msg_with_name = std::make_pair(var_name, msg);
-      queue_->Push(msg_with_name);
+    if (outvar) {
+      SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+                            &reply_);
     }
+    Finish(reply_, &responder_);
   }
 
  protected:
   sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-  framework::Scope* scope_;
-  framework::BlockingQueue<MessageWithName>* queue_;
 };
 
 class RequestPrefetch final : public RequestBase {
  public:
   explicit RequestPrefetch(GrpcService::AsyncService* service,
-                           ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                           framework::Scope* scope,
-                           const platform::DeviceContext* dev_ctx,
-                           framework::Executor* executor,
-                           framework::ProgramDesc* program,
-                           framework::ExecutorPrepareContext* prefetch_ctx)
-      : RequestBase(service, cq, sync_mode, dev_ctx),
+                           ::grpc::ServerCompletionQueue* cq,
+                           RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id),
         responder_(&ctx_),
-        scope_(scope),
-        executor_(executor),
-        program_(program),
-        prefetch_ctx_(prefetch_ctx) {
-    if (sync_mode_) {
-      request_.reset(new VariableResponse(scope, dev_ctx_, false));
-    } else {
-      request_.reset(new VariableResponse(scope, dev_ctx_, true));
-    }
+        local_scope_(nullptr) {
+    request_.reset(new VariableResponse(request_handler->scope(),
+                                        request_handler->dev_ctx(), true));
     int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
-                                cq_, cq_, this);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
   }
 
   virtual ~RequestPrefetch() {}
 
-  virtual std::string GetReqName() { return request_->Varname(); }
+  std::string GetReqName() override { return request_->Varname(); }
 
-  virtual void Process() {
+  void Process() override {
     // prefetch process...
-    ::grpc::ByteBuffer reply;
+    std::string in_var_name = request_->Varname();
+    std::string out_var_name = request_->OutVarname();
+    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name;
 
-    std::string var_name = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch " << var_name;
-    auto var_desc = program_->Block(0).FindVar(var_name);
-    framework::Scope* local_scope = &scope_->NewScope();
-    auto* var = local_scope->FindVar(var_name);
-    InitializeVariable(var, var_desc->GetType());
-    executor_->RunPreparedContext(prefetch_ctx_, scope_);
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = scope->FindVar(in_var_name);
+    framework::Variable* outvar = scope->FindVar(out_var_name);
 
-    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
 
-    responder_.Finish(reply, ::grpc::Status::OK, this);
-    status_ = FINISH;
+    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
+                          &reply_);
+    Finish(reply_, &responder_);
   }
 
  protected:
   std::shared_ptr<VariableResponse> request_;
+  ::grpc::ByteBuffer reply_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-  framework::Scope* scope_;
-  framework::Executor* executor_;
-  framework::ProgramDesc* program_;
-  framework::ExecutorPrepareContext* prefetch_ctx_;
+  framework::Scope* local_scope_;
 };
 
-void AsyncGRPCServer::WaitClientGet(int count) {
-  int fetch_barriers = 0;
-  while (fetch_barriers < count) {
-    auto msg = var_get_queue_.Pop();
-    if (msg.first == FETCH_BARRIER_MESSAGE) {
-      fetch_barriers++;
-    }
-  }
-}
-
 void AsyncGRPCServer::WaitServerReady() {
+  VLOG(3) << "AsyncGRPCServer is wait server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
+  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
 }
 
-void AsyncGRPCServer::RunSyncUpdate() {
+void AsyncGRPCServer::StartServer() {
   ::grpc::ServerBuilder builder;
-  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
+  builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
                            &selected_port_);
+
   builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
   builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
   builder.RegisterService(&service_);
 
-  cq_send_ = builder.AddCompletionQueue();
-  cq_get_ = builder.AddCompletionQueue();
-  cq_prefetch_ = builder.AddCompletionQueue();
+  for (auto t : rpc_call_map_) {
+    rpc_cq_[t.first].reset(builder.AddCompletionQueue().release());
+  }
 
   server_ = builder.BuildAndStart();
-  LOG(INFO) << "Server listening on " << address_
+  LOG(INFO) << "Server listening on " << bind_address_
             << " selected port: " << selected_port_;
 
-  std::function<void()> send_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
-  std::function<void()> get_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
-  std::function<void()> prefetch_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
-
-  // TODO(wuyi): Run these "HandleRequest" in thread pool
-  t_send_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
-                                cq_send_.get(), "cq_send", send_register)));
-  t_get_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
-                                cq_get_.get(), "cq_get", get_register)));
-  t_prefetch_.reset(new std::thread(
-      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
-                "cq_prefetch", prefetch_register)));
+  std::function<void(const std::string&, int)> f =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this,
+                std::placeholders::_1, std::placeholders::_2);
+
+  for (auto& t : rpc_call_map_) {
+    auto& rpc_name = t.first;
+    auto& cq = rpc_cq_[rpc_name];
+    auto threadnum = rpc_thread_num_[rpc_name];
+    auto& reqs = rpc_reqs_[rpc_name];
+
+    reqs.reserve(kRequestBufSize);
+
+    for (int i = 0; i < kRequestBufSize; i++) {
+      TryToRegisterNewOne(rpc_name, i);
+    }
+
+    for (int i = 0; i < threadnum; i++) {
+      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
+          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
+      VLOG(3) << t.first << " creates threads!";
+    }
+  }
 
   {
     std::lock_guard<std::mutex> lock(this->mutex_ready_);
     ready_ = 1;
   }
   condition_ready_.notify_all();
+
   // wait server
   server_->Wait();
-  t_send_->join();
-  t_get_->join();
-  t_prefetch_->join();
+
+  for (auto& t : rpc_threads_) {
+    auto& threads = t.second;
+    for (size_t i = 0; i < threads.size(); ++i) {
+      threads[i]->join();
+      VLOG(3) << t.first << " threads ends!";
+    }
+  }
 }
 
 void AsyncGRPCServer::ShutdownQueue() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  cq_send_->Shutdown();
-  cq_get_->Shutdown();
-  cq_prefetch_->Shutdown();
+  for (auto& t : rpc_cq_) {
+    t.second->Shutdown();
+    VLOG(3) << t.first << " shutdown!";
+  }
 }
 
-// This URL explains why shutdown is complicate:
-void AsyncGRPCServer::ShutDown() {
+void AsyncGRPCServer::ShutDownImpl() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
   is_shut_down_ = true;
   ShutdownQueue();
+
+  VLOG(3) << "server_ shutdown!";
   server_->Shutdown();
 }
 
-void AsyncGRPCServer::TryToRegisterNewSendOne() {
+void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
+                                          int req_id) {
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
     VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
     return;
   }
-  RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
-                                      scope_, &var_recv_queue_, dev_ctx_);
-  VLOG(4) << "Create RequestSend status:" << send->Status();
-}
 
-void AsyncGRPCServer::TryToRegisterNewGetOne() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  if (is_shut_down_) {
-    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
-    return;
+  VLOG(4) << "register send rpc_name:" << rpc_name
+          << ", handler:" << rpc_call_map_[kRequestSend];
+
+  auto& reqs = rpc_reqs_[rpc_name];
+  auto& handler = rpc_call_map_[rpc_name];
+  auto& cq = rpc_cq_[rpc_name];
+
+  RequestBase* b = nullptr;
+  if (rpc_name == kRequestSend) {
+    b = new RequestSend(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestGet) {
+    b = new RequestGet(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestPrefetch) {
+    b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
+  } else {
+    PADDLE_ENFORCE(false, "not supported rpc");
   }
-  RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
-                                   dev_ctx_, &var_get_queue_);
-  VLOG(4) << "Create RequestGet status:" << get->Status();
-}
 
-void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  if (is_shut_down_) {
-    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
-    return;
-  }
-  RequestPrefetch* prefetch =
-      new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
-                          dev_ctx_, executor_, program_, prefetch_ctx_.get());
+  reqs[req_id] = b;
 
-  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
+  VLOG(4) << "Create RequestSend status:" << b->Status();
 }
 
-// FIXME(typhoonzero): change cq_name to enum.
-void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
-                                    const std::string& cq_name,
-                                    std::function<void()> TryToRegisterNewOne) {
-  TryToRegisterNewOne();
-
+void AsyncGRPCServer::HandleRequest(
+    ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
+    std::function<void(const std::string&, int)> TryToRegisterNewOne) {
   void* tag = NULL;
   bool ok = false;
 
   while (true) {
-    VLOG(3) << "HandleRequest for " << cq_name << " wait Next";
+    VLOG(3) << "HandleRequest " << rpc_name << " wait next";
     if (!cq->Next(&tag, &ok)) {
-      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
+      LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!";
       break;
     }
-    VLOG(3) << "HandleRequest for " << cq_name << " get Next";
 
-    PADDLE_ENFORCE(tag);
+    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
+    VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+            << " get next";
 
-    if (sync_mode_) {
-      // FIXME(typhoonzero): de-couple the barriers with recv_op
-      if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
-      if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
-      VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
+    auto& reqs = rpc_reqs_[rpc_name];
+    RequestBase* base = nullptr;
+    {
+      PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize);
+      std::unique_lock<std::mutex> lock(cq_mutex_);
+      base = reqs[req_id];
     }
 
-    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
     // reference:
     // https://github.com/tensorflow/tensorflow/issues/5596
     // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
     // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
     if (!ok) {
-      LOG(WARNING) << cq_name << " recv no regular event:argument name["
+      LOG(WARNING) << "completion queue:" << rpc_name
+                   << " recv no regular event:argument name["
                    << base->GetReqName() << "]";
-      TryToRegisterNewOne();
+      TryToRegisterNewOne(rpc_name, req_id);
       delete base;
       continue;
     }
 
+    VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id
+            << ", status:" << base->Status();
+
     switch (base->Status()) {
       case PROCESS: {
-        TryToRegisterNewOne();
         base->Process();
-        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
         break;
       }
       case FINISH: {
-        VLOG(4) << cq_name << " FINISH status:" << base->Status();
+        TryToRegisterNewOne(rpc_name, req_id);
         delete base;
         break;
       }
@@ -367,20 +353,6 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
   }
 }
 
-void AsyncGRPCServer::WaitCond(int cond) {
-  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
-  barrier_condition_.wait(lock,
-                          [=] { return this->barrier_cond_step_ == cond; });
-}
-
-void AsyncGRPCServer::SetCond(int cond) {
-  {
-    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
-    barrier_cond_step_ = cond;
-  }
-  barrier_condition_.notify_all();
-}
-
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index 238aaa2963..f1db7590f6 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
+#include <set>
 #include <string>
 #include <thread>  // NOLINT
 #include <utility>
+#include <vector>
 
 #include "grpc++/grpc++.h"
 #include "paddle/fluid/framework/blocking_queue.h"
@@ -27,104 +30,58 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/detail/grpc_service.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 namespace detail {
 
-typedef std::pair<std::string, std::shared_ptr<VariableResponse>>
-    ReceivedMessage;
-typedef framework::BlockingQueue<ReceivedMessage> ReceivedQueue;
-
-typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
 class RequestBase;
 
-class AsyncGRPCServer final {
+class AsyncGRPCServer final : public RPCServer {
  public:
-  explicit AsyncGRPCServer(const std::string &address, bool sync_mode)
-      : address_(address), sync_mode_(sync_mode), ready_(0) {}
-
-  ~AsyncGRPCServer() {}
-  void WaitServerReady();
-  void RunSyncUpdate();
-
-  // functions to sync server barrier status.
-  void WaitCond(int cond);
-  void SetCond(int cond);
-  void WaitClientGet(int count);
-
-  void SetScope(framework::Scope *scope) { scope_ = scope; }
-
-  void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
-
-  void SetProgram(framework::ProgramDesc *program) { program_ = program; }
-
-  void SetExecutor(framework::Executor *executor) { executor_ = executor; }
-
-  void SetPrefetchPreparedCtx(
-      std::unique_ptr<framework::ExecutorPrepareContext> prepared) {
-    prefetch_ctx_.reset(prepared.release());
-  }
-
-  int GetSelectedPort() const { return selected_port_; }
+  explicit AsyncGRPCServer(const std::string& address, int client_num)
+      : RPCServer(address, client_num), ready_(0) {}
 
-  const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
+  virtual ~AsyncGRPCServer() {}
+  void WaitServerReady() override;
+  void StartServer() override;
 
-  void Push(const std::string &msg_name) {
-    this->var_recv_queue_.Push(std::make_pair(msg_name, nullptr));
-  }
-
-  void ShutDown();
+ private:
+  // HandleRequest needs to be thread-safe.
+  void HandleRequest(
+      ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
+      std::function<void(const std::string&, int)> TryToRegisterNewOne);
 
- protected:
-  void HandleRequest(::grpc::ServerCompletionQueue *cq,
-                     const std::string &cq_name,
-                     std::function<void()> TryToRegisterNewOne);
-  void TryToRegisterNewSendOne();
-  void TryToRegisterNewGetOne();
-  void TryToRegisterNewPrefetchOne();
+  void TryToRegisterNewOne(const std::string& rpc_name, int req_id);
   void ShutdownQueue();
+  void ShutDownImpl() override;
 
  private:
+  static const int kRequestBufSize = 100;
+
   std::mutex cq_mutex_;
   volatile bool is_shut_down_ = false;
-  std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
-  std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
-  std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;
 
   GrpcService::AsyncService service_;
   std::unique_ptr<::grpc::Server> server_;
 
-  std::string address_;
-  const bool sync_mode_;
-  framework::Scope *scope_;
-  const platform::DeviceContext *dev_ctx_;
-
-  // received variable from RPC, operators fetch variable from this queue.
-  framework::BlockingQueue<MessageWithName> var_get_queue_;
-  // client send variable to this queue.
-  ReceivedQueue var_recv_queue_;
-
   // condition of the sub program
-  std::mutex barrier_mutex_;
-  mutable int barrier_cond_step_;
   std::condition_variable barrier_condition_;
 
-  std::unique_ptr<std::thread> t_send_;
-  std::unique_ptr<std::thread> t_get_;
-  std::unique_ptr<std::thread> t_prefetch_;
-
-  std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;
-  framework::ProgramDesc *program_;
-  framework::Executor *executor_;
-  int selected_port_;
-
   std::mutex mutex_ready_;
   std::condition_variable condition_ready_;
+
   int ready_;
+
+  std::map<std::string, std::unique_ptr<::grpc::ServerCompletionQueue>> rpc_cq_;
+  std::map<std::string, std::vector<std::unique_ptr<std::thread>>> rpc_threads_;
+  std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
 };
 
 };  // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index e6dab2f5a3..e0505c2b9d 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -25,6 +25,8 @@
 #include <grpc++/support/byte_buffer.h>
 #include "paddle/fluid/operators/detail/variable_response.h"
 
+#include "paddle/fluid/platform/profiler.h"
+
 // NOTE: This method was originally created by tensorflow
 //       (https://github.com/tensorflow/tensorflow/) we borrow this
 //       method and did some modifications so that we can parse gRPC
diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h
new file mode 100644
index 0000000000..da1de72dad
--- /dev/null
+++ b/paddle/fluid/operators/detail/macros.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_GRPC
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+#define RPCSERVER_T detail::AsyncGRPCServer
+#define RPCCLIENT_T detail::GRPCClient
+#else
+#include "paddle/fluid/operators/detail/brpc_client.h"
+#include "paddle/fluid/operators/detail/brpc_server.h"
+#define RPCSERVER_T detail::AsyncBRPCServer
+#define RPCCLIENT_T detail::BRPCClient
+#endif
diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h
new file mode 100644
index 0000000000..a2d08747d5
--- /dev/null
+++ b/paddle/fluid/operators/detail/request_handler.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+constexpr char kRequestSend[] = "RequestSend";
+constexpr char kRequestGet[] = "RequestGet";
+constexpr char kRequestPrefetch[] = "RequestPrefetch";
+
+#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
+#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
+#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
+#define COMPLETE_MESSAGE "COMPLETE@RECV"
+
+class RPCServer;
+
+class RequestHandler {
+ public:
+  explicit RequestHandler(bool sync_mode)
+      : sync_mode_(sync_mode),
+        dev_ctx_(nullptr),
+        executor_(nullptr),
+        scope_(nullptr),
+        program_(nullptr),
+        rpc_server_(nullptr) {}
+
+  virtual ~RequestHandler() {}
+
+  // Set attributes.
+  void SetScope(framework::Scope* scope) { scope_ = scope; }
+  void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
+  void SetProgram(framework::ProgramDesc* program) { program_ = program; }
+  void SetExecutor(framework::Executor* executor) { executor_ = executor; }
+
+  // Used for dist lookup table prefetch
+  void SetPrefetchPreparedCtx(
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    prefetch_var_name_to_prepared_ctx_ = g;
+  }
+
+  // Used for async.
+  void SetGradToPreparedCtx(
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    grad_to_prepared_ctx_ = g;
+  }
+
+  void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
+
+  // Get attributes.
+  bool sync_mode() { return sync_mode_; }
+  framework::Scope* scope() { return scope_; }
+  const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
+  framework::ProgramDesc* program() { return program_; }
+  framework::Executor* executor() { return executor_; }
+
+  // This function processes user's rpc request.
+  // The implemention is in request_handler_impl.
+  // example:
+  //    std::string varname = request_.varname();
+  //
+  //    auto scope = request_handler_->scope();
+  //    auto invar = scope->FindVar(varname);
+  //    framework::Variable* outvar = nullptr;
+  //
+  //    request_handler_->Handle(varname, scope, invar, &outvar);
+  //    if (outvar) {
+  //        SerializeToByteBuffer(varname, outvar,
+  //           *request_handler_->dev_ctx(), &reply_);
+  //    }
+  virtual bool Handle(const std::string& varname, framework::Scope* scope,
+                      framework::Variable* var, framework::Variable** outvar,
+                      const std::string& out_var_name = "") = 0;
+
+ protected:
+  const bool sync_mode_;
+
+  const platform::DeviceContext* dev_ctx_;
+  framework::Executor* executor_;
+  framework::Scope* scope_;
+  framework::ProgramDesc* program_;
+
+  // used for distribute lookup table prefetch
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      prefetch_var_name_to_prepared_ctx_;
+
+  // Used for async.
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      grad_to_prepared_ctx_;
+
+  RPCServer* rpc_server_;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc
new file mode 100644
index 0000000000..7425bee798
--- /dev/null
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -0,0 +1,124 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+bool RequestSendHandler::Handle(const std::string& varname,
+                                framework::Scope* scope,
+                                framework::Variable* invar,
+                                framework::Variable** outvar,
+                                const std::string& out_var_name) {
+  VLOG(4) << "RequestSendHandler:" << varname;
+
+  // Async
+  if (!sync_mode_) {
+    try {
+      executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
+                                    scope);
+    } catch (std::exception& e) {
+      LOG(ERROR) << "async: run sub program error " << e.what();
+      return false;
+    }
+    return true;
+  }
+
+  // Sync
+  if (varname == BATCH_BARRIER_MESSAGE) {
+    VLOG(3) << "sync: recv batch barrier message";
+    rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else if (varname == COMPLETE_MESSAGE) {
+    VLOG(3) << "sync: recv complete message";
+    rpc_server_->DecreaseClientNum();
+  } else {
+    VLOG(3) << "sync: received var_name: " << varname;
+    if (sync_mode_) {
+      rpc_server_->WaitCond(kRequestSend);
+    }
+
+    if (invar == nullptr) {
+      LOG(ERROR) << "sync: Can not find server side var: " << varname;
+      PADDLE_THROW("sync: Can not find server side var");
+      return false;
+    }
+    if (invar->IsType<framework::SelectedRows>()) {
+      std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+      sparse_vars_.push_back(invar);
+    }
+  }
+  return true;
+}
+
+void RequestSendHandler::ResetSparseVarRecorder() {
+  std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+  for (auto* var : sparse_vars_) {
+    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+  }
+  sparse_vars_.clear();
+}
+
+bool RequestGetHandler::Handle(const std::string& varname,
+                               framework::Scope* scope,
+                               framework::Variable* invar,
+                               framework::Variable** outvar,
+                               const std::string& out_var_name) {
+  VLOG(4) << "RequestGetHandler:" << varname;
+
+  if (varname != FETCH_BARRIER_MESSAGE) {
+    if (sync_mode_) {
+      rpc_server_->WaitCond(kRequestGet);
+    }
+    *outvar = scope_->FindVar(varname);
+    return true;
+  }
+
+  // FETCH_BARRIER_MESSAGE
+  if (sync_mode_) {
+    VLOG(3) << "sync: recv fetch barrier message";
+    rpc_server_->IncreaseBatchBarrier(kRequestGet);
+  }
+
+  return true;
+}
+
+bool RequestPrefetchHandler::Handle(const std::string& varname,
+                                    framework::Scope* scope,
+                                    framework::Variable* invar,
+                                    framework::Variable** outvar,
+                                    const std::string& out_var_name) {
+  VLOG(4) << "RequestPrefetchHandler " << varname;
+
+  auto var_desc = program_->Block(0).FindVar(out_var_name);
+  InitializeVariable(*outvar, var_desc->GetType());
+  executor_->RunPreparedContext(
+      (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
+
+  return true;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h
new file mode 100644
index 0000000000..3f77c09a95
--- /dev/null
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class RequestSendHandler final : public RequestHandler {
+ public:
+  explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestSendHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
+  void ResetSparseVarRecorder();
+
+ private:
+  std::mutex mutex_sparse_vars_;
+  std::vector<framework::Variable*> sparse_vars_;
+};
+
+class RequestGetHandler final : public RequestHandler {
+ public:
+  explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestGetHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
+};
+
+class RequestPrefetchHandler final : public RequestHandler {
+ public:
+  explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestPrefetchHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_client.cc b/paddle/fluid/operators/detail/rpc_client.cc
new file mode 100644
index 0000000000..9a791403e3
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_client.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/rpc_client.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+std::once_flag RPCClient::init_flag_;
+std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h
new file mode 100644
index 0000000000..47c6ffb4fd
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_client.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class RPCClient {
+ public:
+  RPCClient() {}
+  virtual ~RPCClient() {}
+  virtual bool AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            int64_t time_out = rpc_time_out) = 0;
+
+  virtual bool AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
+                           int64_t time_out = rpc_time_out) = 0;
+
+  virtual bool AsyncPrefetchVar(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& in_var_name,
+                                const std::string& out_var_name,
+                                int64_t time_out = rpc_time_out) = 0;
+
+  virtual void AsyncSendBatchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+
+  virtual void AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+
+  // SendComplete tells all the server that current trainer have no more data
+  // to train, so that the pserver can reduce it's barrier count, and continue
+  // to train with other trainers.
+  virtual void SendComplete() = 0;
+
+  virtual void Wait() = 0;
+
+  static constexpr int64_t rpc_time_out = 120 * 1000;
+
+  template <typename T>
+  static RPCClient* GetInstance() {
+    std::call_once(init_flag_, &RPCClient::Init<T>);
+    return rpc_client_.get();
+  }
+
+  // Init is called by GetInstance.
+  template <typename T>
+  static void Init() {
+    if (rpc_client_.get() == nullptr) {
+      rpc_client_.reset(new T());
+      rpc_client_->InitImpl();
+    }
+  }
+
+ protected:
+  virtual void InitImpl() {}
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<RPCClient> rpc_client_;
+};
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc
new file mode 100644
index 0000000000..cd0fe96e23
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_server.cc
@@ -0,0 +1,117 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "paddle/fluid/operators/detail/rpc_server.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+void RPCServer::ShutDown() {
+  LOG(INFO) << "RPCServer ShutDown ";
+  ShutDownImpl();
+
+  exit_flag_ = true;
+  barrier_cond_.notify_all();
+  rpc_cond_.notify_all();
+}
+
+void RPCServer::SavePort() const {
+  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
+  std::ofstream port_file;
+  port_file.open(file_path);
+  port_file << selected_port_;
+  port_file.close();
+  VLOG(4) << "selected port written to " << file_path;
+}
+
+void RPCServer::WaitBarrier(const std::string& rpc_name) {
+  std::unique_lock<std::mutex> lock(this->mutex_);
+  barrier_cond_.wait(lock, [this, &rpc_name] {
+    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
+  });
+
+  VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name];
+}
+
+void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
+  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  int b = 0;
+  std::unique_lock<std::mutex> lock(mutex_);
+  b = ++barrier_counter_[rpc_name];
+  if (b >= client_num_) {
+    lock.unlock();
+    barrier_cond_.notify_all();
+    lock.lock();
+  }
+}
+
+void RPCServer::DecreaseClientNum() {
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    client_num_--;
+  }
+  barrier_cond_.notify_all();
+}
+
+void RPCServer::ResetBarrierCounter() {
+  VLOG(3) << "RPCServer ResetBarrierCounter ";
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto& t : barrier_counter_) {
+    t.second = 0;
+  }
+}
+
+void RPCServer::RegisterRPC(const std::string& rpc_name,
+                            RequestHandler* handler, int thread_num) {
+  rpc_call_map_[rpc_name] = handler;
+  rpc_thread_num_[rpc_name] = thread_num;
+
+  static int cond = -1;
+  rpc_cond_map_[rpc_name] = ++cond;
+  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
+          << ", cond:" << rpc_cond_map_[rpc_name];
+}
+
+void RPCServer::SetCond(const std::string& rpc_name) {
+  VLOG(3) << "RPCServer SetCond " << rpc_name;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cur_cond_ = rpc_cond_map_[rpc_name];
+  }
+
+  rpc_cond_.notify_all();
+}
+
+void RPCServer::WaitCond(const std::string& rpc_name) {
+  VLOG(3) << "RPCServer WaitCond " << rpc_name;
+  int cond = 0;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond = rpc_cond_map_[rpc_name];
+  }
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  rpc_cond_.wait(
+      lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h
new file mode 100644
index 0000000000..2e3342428c
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+#include "paddle/fluid/operators/detail/request_handler.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class RPCServer {
+ public:
+  explicit RPCServer(const std::string& address, int client_num)
+      : cur_cond_(0),
+        bind_address_(address),
+        exit_flag_(false),
+        selected_port_(0),
+        client_num_(client_num) {}
+
+  virtual ~RPCServer() {}
+  virtual void StartServer() = 0;
+  virtual void WaitServerReady() = 0;
+
+  void ShutDown();
+
+  bool IsExit() { return exit_flag_.load(); }
+
+  int GetSelectedPort() const { return selected_port_; }
+  void SavePort() const;
+
+  // RegisterRPC, register the rpc method name to a handler
+  // class, and auto generate a condition id for this call
+  // to be used for the barrier.
+  void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
+                   int thread_num = 5);
+
+  // Wait util all the clients have reached the barrier for one
+  // rpc method. This function should be called in the
+  // RequestHandler if you want to run the server/client in a
+  // synchronous mode.
+  void WaitBarrier(const std::string& rpc_name);
+
+  void SetCond(const std::string& rpc_name);
+  void WaitCond(const std::string& rpc_name);
+  void IncreaseBatchBarrier(const std::string rpc_name);
+  void DecreaseClientNum();
+  void ResetBarrierCounter();
+
+ protected:
+  virtual void ShutDownImpl() = 0;
+
+ private:
+  std::mutex mutex_;
+  std::unordered_map<std::string, int> barrier_counter_;
+  std::condition_variable barrier_cond_;
+
+  std::unordered_map<std::string, int> rpc_cond_map_;
+  std::atomic<int> cur_cond_;
+  std::condition_variable rpc_cond_;
+
+ protected:
+  std::string bind_address_;
+  std::atomic<int> exit_flag_;
+  int selected_port_;
+  int client_num_;
+
+  std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
+  std::unordered_map<std::string, int> rpc_thread_num_;
+  friend class RequestHandler;
+};
+
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/rpc_server_test.cc
similarity index 60%
rename from paddle/fluid/operators/detail/grpc_server_test.cc
rename to paddle/fluid/operators/detail/rpc_server_test.cc
index b8db0ad987..463a7b80cf 100644
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/rpc_server_test.cc
@@ -17,20 +17,23 @@ limitations under the License. */
 #include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/operators/detail/grpc_client.h"
-#include "paddle/fluid/operators/detail/grpc_server.h"
-
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace detail = paddle::operators::detail;
 
 USE_OP(lookup_table);
 
-std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
+std::unique_ptr<detail::RPCServer> g_rpc_service;
+std::unique_ptr<detail::RequestHandler> g_req_handler;
 
 framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
   auto root_block = program->MutableBlock(0);
@@ -88,53 +91,72 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
   }
 }
 
-void StartServer(const std::string& endpoint) {
-  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, true));
+void StartServer() {
   framework::ProgramDesc program;
   framework::Scope scope;
   platform::CPUPlace place;
   framework::Executor exe(place);
   platform::CPUDeviceContext ctx(place);
   auto* block = AppendPrefetchBlcok(&program);
-  auto prepared = exe.Prepare(program, block->ID());
+  std::string in_var_name("ids");
+  std::vector<int> prefetch_block_ids{block->ID()};
+  auto prepared = exe.Prepare(program, prefetch_block_ids);
   InitTensorsOnServer(&scope, &place, 10);
 
-  rpc_service_->SetProgram(&program);
-  rpc_service_->SetPrefetchPreparedCtx(std::move(prepared));
-  rpc_service_->SetDevCtx(&ctx);
-  rpc_service_->SetScope(&scope);
-  rpc_service_->SetExecutor(&exe);
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared;
+  prefetch_var_name_to_prepared[in_var_name] = prepared[0];
+  g_req_handler->SetProgram(&program);
+  g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
+  g_req_handler->SetDevCtx(&ctx);
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetExecutor(&exe);
+
+  g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
 
-  rpc_service_->RunSyncUpdate();
+  std::thread server_thread(
+      std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
+
+  server_thread.join();
 }
 
 TEST(PREFETCH, CPU) {
-  // start up a server instance backend
-  std::thread server_thread(StartServer, "127.0.0.1:8889");
-  sleep(2);
+  g_req_handler.reset(new detail::RequestPrefetchHandler(true));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
+  detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
+
+  std::thread server_thread(StartServer);
+  g_rpc_service->WaitServerReady();
+
+  int port = g_rpc_service->GetSelectedPort();
+  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
+
   framework::Scope scope;
   platform::CPUPlace place;
   platform::CPUDeviceContext ctx(place);
-  // create var on local scope
-  int64_t rows_numel = 5;
-  InitTensorsOnClient(&scope, &place, rows_numel);
-  std::string in_var_name("ids");
-  std::string out_var_name("out");
-
-  detail::RPCClient client;
-  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
-                               out_var_name);
-  client.Wait();
-
-  auto var = scope.Var(out_var_name);
-  auto value = var->GetMutable<framework::SelectedRows>()->value();
-  auto ptr = value.mutable_data<float>(place);
+  {
+    // create var on local scope
+    int64_t rows_numel = 5;
+    InitTensorsOnClient(&scope, &place, rows_numel);
+    std::string in_var_name("ids");
+    std::string out_var_name("out");
+
+    client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
+    client->Wait();
+    auto var = scope.Var(out_var_name);
+    auto value = var->GetMutable<framework::SelectedRows>()->value();
+    auto ptr = value.mutable_data<float>(place);
+
+    for (int64_t i = 0; i < rows_numel; ++i) {
+      EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+    }
+  }
 
-  rpc_service_->ShutDown();
+  g_rpc_service->ShutDown();
   server_thread.join();
-  rpc_service_.reset(nullptr);
-
-  for (int64_t i = 0; i < rows_numel; ++i) {
-    EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
-  }
+  LOG(INFO) << "begin reset";
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
 }
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index 9478c5702b..54cb93e04d 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -14,6 +14,8 @@ limitations under the License. */
 syntax = "proto3";
 package sendrecv;
 
+// option cc_generic_services = true;
+
 service SendRecvService {
   // For parameter server round-robin like hashing, do not split tensors.
   // Send and recv only one tensor
@@ -70,10 +72,10 @@ message VariableMessage {
   bytes rows = 9;
   // Look up table block execution output variable name.
   string out_varname = 10;
-  // If true, the ps server will start profiling, the ps
+  // If 1, the ps server will start profiling, the ps
   // server stops profiling and generates a profile to /tmp/profile_ps_*
-  // when profile switches from true to false.
-  bool profile = 11;
+  // when profile switches from 1 to 2.
+  int64 profile = 11;
 }
 
 message VoidMessage {}
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 07c43554bc..507b465435 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -58,12 +58,13 @@ void GetTensorPayload(framework::Variable* var,
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-    platform::CPUPlace cpu;
+    platform::CUDAPinnedPlace cuda_pinned;
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    *payload = memory::Alloc(cpu, copy_size);
+    *payload = memory::Alloc(cuda_pinned, copy_size);
 
-    memory::Copy(cpu, *payload, boost::get<platform::CUDAPlace>(tensor.place()),
+    memory::Copy(cuda_pinned, *payload,
+                 boost::get<platform::CUDAPlace>(tensor.place()),
                  reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
                  gpu_dev_ctx.stream());
     ctx.Wait();
@@ -90,11 +91,11 @@ void GetSelectedRowsPayload(framework::Variable* var,
   auto* tensor = slr->mutable_value();
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
-    platform::CPUPlace cpu;
+    platform::CUDAPinnedPlace cuda_pinned;
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
-    *payload = memory::Alloc(cpu, copy_size);
-    memory::Copy(cpu, *payload,
+    *payload = memory::Alloc(cuda_pinned, copy_size);
+    memory::Copy(cuda_pinned, *payload,
                  boost::get<platform::CUDAPlace>(tensor->place()),
                  reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
                  gpu_dev_ctx.stream());
@@ -122,7 +123,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   // 1 trainer returns true for ShouldSendProfileState(). It tells PS
   // servers the trainer's profiling state so that PS can follow the
   // trainer.
-  request.set_profile(platform::IsProfileEnabled());
+  if (platform::ShouldSendProfileState()) {
+    if (platform::IsProfileEnabled()) {
+      request.set_profile(platform::kEnableProfiler);
+    } else {
+      request.set_profile(platform::kDisableProfiler);
+    }
+  }
   if (!out_name.empty()) {
     request.set_out_varname(out_name);
   }
@@ -142,12 +149,14 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   }
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
     // GPU data is copied to CPU buffer when sending,
     // free the buffer when possible.
     destroy_callback = [](void* backing) {
-      platform::CPUPlace cpu;
-      memory::Free(cpu, backing);
+      platform::CUDAPinnedPlace cuda_pinned;
+      memory::Free(cuda_pinned, backing);
     };
+#endif
   }
 
   std::string header;
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h
index c72e1bd076..bd16bf1dab 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
@@ -32,16 +32,6 @@ namespace paddle {
 namespace operators {
 namespace detail {
 
-#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
-#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
-#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
-
-static int64_t GetTimestamp() {
-  struct timeval tp;
-  gettimeofday(&tp, NULL);
-  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
-}
-
 typedef void (*DestroyCallback)(void*);
 
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index 462e303096..24cb91a3bb 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -449,8 +449,8 @@ int VariableResponse::Parse(Source* source) {
         break;
       }
       case sendrecv::VariableMessage::kProfileFieldNumber: {
-        bool profiling;
-        if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
+        uint64_t profiling = 0;
+        if (!input.ReadVarint64(&profiling)) {
           return tag;
         }
         meta_.set_profile(profiling);
@@ -458,9 +458,11 @@ int VariableResponse::Parse(Source* source) {
         if (listener_id <= 0) {
           break;
         }
-        if (profiling && !platform::IsProfileEnabled()) {
+        if (profiling == platform::kEnableProfiler &&
+            !platform::IsProfileEnabled()) {
           platform::EnableProfiler(platform::ProfilerState::kCPU);
-        } else if (!profiling && platform::IsProfileEnabled()) {
+        } else if (profiling == platform::kDisableProfiler &&
+                   platform::IsProfileEnabled()) {
           // TODO(panyx0718): Should we allow to customize file dir.
           platform::DisableProfiler(
               platform::EventSortingKey::kDefault,
diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h
index bf624da2a6..69cfd784f8 100644
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
@@ -67,8 +67,8 @@ class VariableResponse {
 
   framework::Scope* GetMutableLocalScope() const { return local_scope_; }
 
-  inline std::string Varname() { return meta_.varname(); }
-  inline std::string OutVarname() { return meta_.out_varname(); }
+  inline std::string Varname() const { return meta_.varname(); }
+  inline std::string OutVarname() const { return meta_.out_varname(); }
 
   // should call parse first.
   framework::Variable* GetVar() {
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index a5bb58c2f4..20d960f9fe 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -24,6 +24,8 @@ detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
 target_assign_op.cu)
+detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
+    polygon_box_transform_op.cu)
 
 # Export local libraries to parent
 set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
index 74848005d0..8c4b4321b7 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -22,21 +22,21 @@ class BoxCoderOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
                    "Input(PriorBox) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
-                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
                    "Input(TargetBox) of BoxCoderOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
                    "Output(OutputBox) of BoxCoderOp should not be null.");
 
     auto prior_box_dims = ctx->GetInputDim("PriorBox");
-    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
     auto target_box_dims = ctx->GetInputDim("TargetBox");
 
     PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
                       "The rank of Input of PriorBoxVar must be 2");
     PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
-    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    if (ctx->HasInput("PriorBoxVar")) {
+      auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+      PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    }
 
     auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
     if (code_type == BoxCodeType::kEncodeCenterSize) {
@@ -71,9 +71,11 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
         "of the coordinate system. [xmax, ymax] is the right bottom "
         "coordinate of the anchor box.");
     AddInput("PriorBoxVar",
-             "(Tensor, default Tensor<float>) "
+             "(Tensor, default Tensor<float>, optional) "
              "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
-             "of variance.");
+             "of variance. PriorBoxVar will set all elements to 1 by "
+             "default.")
+        .AsDispensable();
     AddInput(
         "TargetBox",
         "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
@@ -91,6 +93,10 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
                          "the code type used with the target box")
         .SetDefault("encode_center_size")
         .InEnum({"encode_center_size", "decode_center_size"});
+    AddAttr<bool>("box_normalized",
+                  "(bool, default true) "
+                  "whether treat the priorbox as a noramlized box")
+        .SetDefault(true);
     AddOutput("OutputBox",
               "(LoDTensor or Tensor) "
               "When code_type is 'encode_center_size', the output tensor of "
@@ -127,5 +133,6 @@ width and height.
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
-                       ops::BoxCoderKernel<double>);
+REGISTER_OP_CPU_KERNEL(
+    box_coder, ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu
index 8cef8e0343..a7af111f63 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cu
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
@@ -20,15 +20,16 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
                                        const T* prior_box_var_data,
                                        const T* target_box_data, const int row,
                                        const int col, const int len,
-                                       T* output) {
+                                       const bool normalized, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int row_idx = idx / col;
     const int col_idx = idx % col;
-    T prior_box_width =
-        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
-    T prior_box_height =
-        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+    T prior_box_width = prior_box_data[col_idx * len + 2] -
+                        prior_box_data[col_idx * len] + (normalized == false);
+    T prior_box_height = prior_box_data[col_idx * len + 3] -
+                         prior_box_data[col_idx * len + 1] +
+                         (normalized == false);
     T prior_box_center_x =
         (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
     T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
@@ -41,20 +42,24 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
     T target_box_center_y = (target_box_data[row_idx * len + 3] +
                              target_box_data[row_idx * len + 1]) /
                             2;
-    T target_box_width =
-        target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
-    T target_box_height =
-        target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
+    T target_box_width = target_box_data[row_idx * len + 2] -
+                         target_box_data[row_idx * len] + (normalized == false);
+    T target_box_height = target_box_data[row_idx * len + 3] -
+                          target_box_data[row_idx * len + 1] +
+                          (normalized == false);
 
-    output[idx * len] = (target_box_center_x - prior_box_center_x) /
-                        prior_box_width / prior_box_var_data[col_idx * len];
-    output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
-                            prior_box_height /
-                            prior_box_var_data[col_idx * len + 1];
-    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
-                            prior_box_var_data[col_idx * len + 2];
-    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
-                            prior_box_var_data[col_idx * len + 3];
+    output[idx * len] =
+        (target_box_center_x - prior_box_center_x) / prior_box_width;
+    output[idx * len + 1] =
+        (target_box_center_y - prior_box_center_y) / prior_box_height;
+    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
+    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
+    if (prior_box_var_data) {
+      output[idx * len] /= prior_box_var_data[col_idx * len];
+      output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1];
+      output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2];
+      output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3];
+    }
   }
 }
 
@@ -63,42 +68,56 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                                        const T* prior_box_var_data,
                                        const T* target_box_data, const int row,
                                        const int col, const int len,
-                                       T* output) {
+                                       const bool normalized, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int col_idx = idx % col;
-    T prior_box_width =
-        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
-    T prior_box_height =
-        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+    T prior_box_width = prior_box_data[col_idx * len + 2] -
+                        prior_box_data[col_idx * len] + (normalized == false);
+    T prior_box_height = prior_box_data[col_idx * len + 3] -
+                         prior_box_data[col_idx * len + 1] +
+                         (normalized == false);
     T prior_box_center_x =
         (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
     T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
                             prior_box_data[col_idx * len + 1]) /
                            2;
-
-    T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
+    T target_box_width, target_box_height;
+    T target_box_center_x, target_box_center_y;
+    if (prior_box_var_data) {
+      target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
                              target_box_data[idx * len + 2]) *
                          prior_box_width;
-    T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
+      target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
                               target_box_data[idx * len + 3]) *
                           prior_box_height;
-    T target_box_center_x = prior_box_var_data[col_idx * len] *
+      target_box_center_x = prior_box_var_data[col_idx * len] *
                                 target_box_data[idx * len] * prior_box_width +
                             prior_box_center_x;
-    T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
+      target_box_center_y = prior_box_var_data[col_idx * len + 1] *
                                 target_box_data[idx * len + 1] *
                                 prior_box_height +
                             prior_box_center_y;
+    } else {
+      target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width;
+      target_box_height =
+          exp(target_box_data[idx * len + 3]) * prior_box_height;
+      target_box_center_x =
+          target_box_data[idx * len] * prior_box_width + prior_box_center_x;
+      target_box_center_y = target_box_data[idx * len + 1] * prior_box_height +
+                            prior_box_center_y;
+    }
 
     output[idx * len] = target_box_center_x - target_box_width / 2;
     output[idx * len + 1] = target_box_center_y - target_box_height / 2;
-    output[idx * len + 2] = target_box_center_x + target_box_width / 2;
-    output[idx * len + 3] = target_box_center_y + target_box_height / 2;
+    output[idx * len + 2] =
+        target_box_center_x + target_box_width / 2 - (normalized == false);
+    output[idx * len + 3] =
+        target_box_center_y + target_box_height / 2 - (normalized == false);
   }
 }
 
-template <typename T>
+template <typename DeviceContext, typename T>
 class BoxCoderCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -109,6 +128,11 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
     auto* output_box = context.Output<framework::Tensor>("OutputBox");
 
+    const T* prior_box_data = prior_box->data<T>();
+    const T* target_box_data = target_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+
     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                         "Only support 1 level of LoD.");
@@ -120,22 +144,19 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     int grid = (row * col + block - 1) / block;
     auto& device_ctx = context.cuda_device_context();
 
-    const T* prior_box_data = prior_box->data<T>();
-    const T* prior_box_var_data = prior_box_var->data<T>();
-    const T* target_box_data = target_box->data<T>();
-
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
     T* output = output_box->data<T>();
 
     auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
     if (code_type == BoxCodeType::kEncodeCenterSize) {
       EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
           prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          output);
+          normalized, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
       DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
           prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          output);
+          normalized, output);
     }
   }
 };
@@ -144,5 +165,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
-                        ops::BoxCoderCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    box_coder,
+    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
index 77fc6c2b62..5ed8520acd 100644
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -28,26 +28,28 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
   PADDLE_THROW("Not support type %s.", type);
 }
 
-template <typename T>
+template <typename DeviceContext, typename T>
 class BoxCoderKernel : public framework::OpKernel<T> {
  public:
-  void EncodeCenterSize(const framework::Tensor& target_box,
-                        const framework::Tensor& prior_box,
-                        const framework::Tensor& prior_box_var,
-                        T* output) const {
-    int64_t row = target_box.dims()[0];
-    int64_t col = prior_box.dims()[0];
-    int64_t len = prior_box.dims()[1];
-    auto* target_box_data = target_box.data<T>();
-    auto* prior_box_data = prior_box.data<T>();
-    auto* prior_box_var_data = prior_box_var.data<T>();
+  void EncodeCenterSize(const framework::Tensor* target_box,
+                        const framework::Tensor* prior_box,
+                        const framework::Tensor* prior_box_var,
+                        const bool normalized, T* output) const {
+    int64_t row = target_box->dims()[0];
+    int64_t col = prior_box->dims()[0];
+    int64_t len = prior_box->dims()[1];
+    auto* target_box_data = target_box->data<T>();
+    auto* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
-        T prior_box_width =
-            prior_box_data[j * len + 2] - prior_box_data[j * len];
-        T prior_box_height =
-            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
+        T prior_box_width = prior_box_data[j * len + 2] -
+                            prior_box_data[j * len] + (normalized == false);
+        T prior_box_height = prior_box_data[j * len + 3] -
+                             prior_box_data[j * len + 1] +
+                             (normalized == false);
         T prior_box_center_x =
             (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
         T prior_box_center_y =
@@ -57,67 +59,89 @@ class BoxCoderKernel : public framework::OpKernel<T> {
             (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
         T target_box_center_y =
             (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-        T target_box_width =
-            target_box_data[i * len + 2] - target_box_data[i * len];
-        T target_box_height =
-            target_box_data[i * len + 3] - target_box_data[i * len + 1];
+        T target_box_width = target_box_data[i * len + 2] -
+                             target_box_data[i * len] + (normalized == false);
+        T target_box_height = target_box_data[i * len + 3] -
+                              target_box_data[i * len + 1] +
+                              (normalized == false);
 
         size_t offset = i * col * len + j * len;
-        output[offset] = (target_box_center_x - prior_box_center_x) /
-                         prior_box_width / prior_box_var_data[j * len];
-        output[offset + 1] = (target_box_center_y - prior_box_center_y) /
-                             prior_box_height / prior_box_var_data[j * len + 1];
+        output[offset] =
+            (target_box_center_x - prior_box_center_x) / prior_box_width;
+        output[offset + 1] =
+            (target_box_center_y - prior_box_center_y) / prior_box_height;
         output[offset + 2] =
-            std::log(std::fabs(target_box_width / prior_box_width)) /
-            prior_box_var_data[j * len + 2];
+            std::log(std::fabs(target_box_width / prior_box_width));
         output[offset + 3] =
-            std::log(std::fabs(target_box_height / prior_box_height)) /
-            prior_box_var_data[j * len + 3];
+            std::log(std::fabs(target_box_height / prior_box_height));
+        if (prior_box_var) {
+          output[offset] /= prior_box_var_data[j * len];
+          output[offset + 1] /= prior_box_var_data[j * len + 1];
+          output[offset + 2] /= prior_box_var_data[j * len + 2];
+          output[offset + 3] /= prior_box_var_data[j * len + 3];
+        }
       }
     }
   }
-  void DecodeCenterSize(const framework::Tensor& target_box,
-                        const framework::Tensor& prior_box,
-                        const framework::Tensor& prior_box_var,
-                        T* output) const {
-    int64_t row = target_box.dims()[0];
-    int64_t col = prior_box.dims()[0];
-    int64_t len = prior_box.dims()[1];
-
-    auto* target_box_data = target_box.data<T>();
-    auto* prior_box_data = prior_box.data<T>();
-    auto* prior_box_var_data = prior_box_var.data<T>();
+  void DecodeCenterSize(const framework::Tensor* target_box,
+                        const framework::Tensor* prior_box,
+                        const framework::Tensor* prior_box_var,
+                        const bool normalized, T* output) const {
+    int64_t row = target_box->dims()[0];
+    int64_t col = prior_box->dims()[0];
+    int64_t len = prior_box->dims()[1];
+
+    auto* target_box_data = target_box->data<T>();
+    auto* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
-        T prior_box_width =
-            prior_box_data[j * len + 2] - prior_box_data[j * len];
-        T prior_box_height =
-            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
+        T prior_box_width = prior_box_data[j * len + 2] -
+                            prior_box_data[j * len] + (normalized == false);
+        T prior_box_height = prior_box_data[j * len + 3] -
+                             prior_box_data[j * len + 1] +
+                             (normalized == false);
         T prior_box_center_x =
             (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
         T prior_box_center_y =
             (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
 
-        T target_box_center_x = prior_box_var_data[j * len] *
+        T target_box_center_x = 0, target_box_center_y = 0;
+        T target_box_width = 0, target_box_height = 0;
+        if (prior_box_var) {
+          target_box_center_x = prior_box_var_data[j * len] *
                                     target_box_data[offset] * prior_box_width +
                                 prior_box_center_x;
-        T target_box_center_y = prior_box_var_data[j * len + 1] *
+          target_box_center_y = prior_box_var_data[j * len + 1] *
                                     target_box_data[offset + 1] *
                                     prior_box_height +
                                 prior_box_center_y;
-        T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
+          target_box_width = std::exp(prior_box_var_data[j * len + 2] *
                                       target_box_data[offset + 2]) *
                              prior_box_width;
-        T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
+          target_box_height = std::exp(prior_box_var_data[j * len + 3] *
                                        target_box_data[offset + 3]) *
                               prior_box_height;
+        } else {
+          target_box_center_x =
+              target_box_data[offset] * prior_box_width + prior_box_center_x;
+          target_box_center_y = target_box_data[offset + 1] * prior_box_height +
+                                prior_box_center_y;
+          target_box_width =
+              std::exp(target_box_data[offset + 2]) * prior_box_width;
+          target_box_height =
+              std::exp(target_box_data[offset + 3]) * prior_box_height;
+        }
 
         output[offset] = target_box_center_x - target_box_width / 2;
         output[offset + 1] = target_box_center_y - target_box_height / 2;
-        output[offset + 2] = target_box_center_x + target_box_width / 2;
-        output[offset + 3] = target_box_center_y + target_box_height / 2;
+        output[offset + 2] =
+            target_box_center_x + target_box_width / 2 - (normalized == false);
+        output[offset + 3] =
+            target_box_center_y + target_box_height / 2 - (normalized == false);
       }
     }
   }
@@ -139,11 +163,14 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
 
     auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
     T* output = output_box->data<T>();
     if (code_type == BoxCodeType::kEncodeCenterSize) {
-      EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+      EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
+                       output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized,
+                       output);
     }
   }
 };
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
new file mode 100644
index 0000000000..335e8dd470
--- /dev/null
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* in = ctx.Input<Tensor>("Input");
+    auto in_dims = in->dims();
+    const T* in_data = in->data<T>();
+    auto* out = ctx.Output<Tensor>("Output");
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = in_dims[0];
+    int geo_channel = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int id = 0;
+    for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
+      for (int id_h = 0; id_h < height; ++id_h) {
+        for (int id_w = 0; id_w < width; ++id_w) {
+          id = id_n * height * width + width * id_h + id_w;
+          if (id_n % 2 == 0) {
+            out_data[id] = id_w - in_data[id];
+          } else {
+            out_data[id] = id_h - in_data[id];
+          }
+        }
+      }
+    }
+  }
+};
+
+class PolygonBoxTransformOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("Input"),
+        "Input (Input) of polygon_box transform op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Output"),
+        "Output (Output) of polygon_box transform op should not be null.");
+
+    auto in_dim = ctx->GetInputDim("Input");
+
+    PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
+    PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
+                      "input's second dimension must be even.");
+
+    ctx->SetOutputDim("Output", in_dim);
+  }
+};
+
+class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "The input with shape [batch_size, geometry_channels, height, width]");
+    AddOutput("Output", "The output with the same shape as input");
+
+    AddComment(R"DOC(
+PolygonBoxTransform Operator.
+The input is the final geometry output in detection network.
+We use 2*n numbers to denote the coordinate shift from n corner vertices of
+the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
+the geometry output contains 2*n channels.
+PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
+                  ops::PolygonBoxTransformOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    polygon_box_transform,
+    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
+    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
new file mode 100644
index 0000000000..6187ac6622
--- /dev/null
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using platform::PADDLE_CUDA_NUM_THREADS;
+#define CUDA_BLOCK_SIZE 16
+
+template <typename T>
+__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
+                                          const T* input, T* output) {
+  int id_n = threadIdx.x + blockDim.x * blockIdx.x;
+  int id_h = threadIdx.y + blockDim.y * blockIdx.y;
+  int id_w = threadIdx.z + blockDim.z * blockIdx.z;
+  if (id_n < n && id_h < h && id_w < w) {
+    int id = id_n * h * w + w * id_h + id_w;
+    if (id_n % 2 == 0) {
+      output[id] = id_w - input[id];
+    } else {
+      output[id] = id_h - input[id];
+    }
+  }
+}
+
+template <typename T>
+class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* in = ctx.Input<Tensor>("Input");
+    auto in_dims = in->dims();
+    const T* in_data = in->data<T>();
+    auto* out = ctx.Output<Tensor>("Output");
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = in_dims[0];
+    int geo_channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    dim3 threadsPerBlock(
+        PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
+        CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
+    dim3 numBlocks((batch_size * geo_channels) / threadsPerBlock.x,
+                   (height + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (width + threadsPerBlock.z - 1) / threadsPerBlock.z);
+    auto stream = ctx.cuda_device_context().stream();
+    PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
+        batch_size * geo_channels, height, width, in_data, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    polygon_box_transform,
+    paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
+    paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index 0ccf701b61..716c8625d3 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -51,7 +51,8 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(label_dims.size(), 2,
                       "The rank of Input(Label) must be 2, "
                       "the shape is [N, 6].");
-    PADDLE_ENFORCE_EQ(label_dims[1], 6, "The shape is of Input(Label) [N, 6].");
+    PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
+                   "The shape of Input(Label) is [N, 6] or [N, 5].");
 
     if (ctx->HasInput("PosCount")) {
       PADDLE_ENFORCE(ctx->HasInput("TruePos"),
@@ -88,9 +89,10 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
              "no detected data.");
     AddInput("Label",
-             "(LoDTensor) A 2-D LoDTensor with shape[N, 6] represents the"
+             "(LoDTensor) A 2-D LoDTensor represents the"
              "Labeled ground-truth data. Each row has 6 values: "
-             "[label, is_difficult, xmin, ymin, xmax, ymax], N is the total "
+             "[label, xmin, ymin, xmax, ymax, is_difficult] or 5 values: "
+             "[label, xmin, ymin, xmax, ymax], where N is the total "
              "number of ground-truth data in this mini-batch. For each "
              "instance, the offsets in first dimension are called LoD, "
              "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, "
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index 431812e2bf..dd1ab85fd8 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -72,7 +72,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto* out_false_pos = ctx.Output<framework::LoDTensor>("AccumFalsePos");
 
     float overlap_threshold = ctx.Attr<float>("overlap_threshold");
-    float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
+    bool evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
     auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
     int class_num = ctx.Attr<int>("class_num");
 
@@ -175,14 +175,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     for (int n = 0; n < batch_size; ++n) {
       std::map<int, std::vector<Box>> boxes;
       for (size_t i = label_index[n]; i < label_index[n + 1]; ++i) {
-        Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
         int label = labels(i, 0);
-        auto is_difficult = labels(i, 1);
-        if (std::abs(is_difficult - 0.0) < 1e-6)
-          box.is_difficult = false;
-        else
-          box.is_difficult = true;
-        boxes[label].push_back(box);
+        if (input_label.dims()[1] == 6) {
+          Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
+          auto is_difficult = labels(i, 1);
+          if (std::abs(is_difficult - 0.0) < 1e-6)
+            box.is_difficult = false;
+          else
+            box.is_difficult = true;
+          boxes[label].push_back(box);
+        } else {
+          PADDLE_ENFORCE_EQ(input_label.dims()[1], 5);
+          Box box(labels(i, 1), labels(i, 2), labels(i, 3), labels(i, 4));
+          boxes[label].push_back(box);
+        }
       }
       gt_boxes->push_back(boxes);
     }
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index 253964562c..baf04c30b1 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -24,19 +26,57 @@ struct AddFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
 };
 
+template <typename DeviceContext, typename T>
+void default_elementwise_add(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y, framework::Tensor* z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        AddFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
+  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
+
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data());
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
+    const auto x = ctx.Input<Tensor>("X");
+    const auto y = ctx.Input<Tensor>("Y");
+    auto z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          AddFunctor<T>(), z);
+
+    auto dims_equal = x->dims() == y->dims();
+    if (dims_equal) {
+      elementwise_add<DeviceContext, T>(ctx, x, y, z);
+    } else {
+      default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+    }
   }
 };
 
@@ -45,6 +85,55 @@ struct IdentityGrad {
   HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
 };
 
+template <typename DeviceContext, typename T>
+void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
+                                  const framework::Tensor* x,
+                                  const framework::Tensor* y,
+                                  const framework::Tensor* out,
+                                  const framework::Tensor* dout,
+                                  framework::Tensor* dx,
+                                  framework::Tensor* dy) {
+  int axis = ctx.Attr<int>("axis");
+
+  ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
+      ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
+      IdentityGrad<T>());
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_grad(const framework::ExecutionContext& ctx,
+                     const framework::Tensor* x, const framework::Tensor* y,
+                     const framework::Tensor* out,
+                     const framework::Tensor* dout, framework::Tensor* dx,
+                     framework::Tensor* dy) {
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+
+  if (dx) {
+    blas.VCOPY(dout->numel(), dout->data<T>(),
+               dx->mutable_data<T>(ctx.GetPlace()));
+  }
+
+  if (dy) {
+    blas.VCOPY(dout->numel(), dout->data<T>(),
+               dy->mutable_data<T>(ctx.GetPlace()));
+  }
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_grad(const framework::ExecutionContext& ctx,
+                     const framework::Tensor* x, const framework::Tensor* y,
+                     const framework::Tensor* out,
+                     const framework::Tensor* dout, framework::Tensor* dx,
+                     framework::Tensor* dy) {
+  default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
@@ -57,10 +146,13 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-        IdentityGrad<T>());
+
+    if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
+      elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
+    } else {
+      default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
+                                                     dy);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index f4cec8ad97..12364fff96 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -59,47 +59,48 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    AddOutput("Out", "The output of elementwise op.");
+    AddOutput("Out", "The output of elementwise op.").Reuse("X");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
                  "for broadcasting Y onto X.")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
     AddComment(string::Sprintf(R"DOC(
-Limited Elementwise %s Operator.
+Limited Elementwise %s Operator
 
 The equation is:
 
 $$%s$$
 
-$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
-smaller than or equal to the dimensions of $X$.
+- $X$: a tensor of any dimension. 
+- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
-1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
-   of size 1 for $Y$ will be ignored for the consideration of subsequence.
 
+1. The shape of $Y$ is the same with $X$.
+2. The shape of $Y$ is a continuous subsequence of $X$.
 
 For case 2:
 
-$Y$ will be broadcasted to match the shape of $X$ and axis should be
-set to index of the start dimension to broadcast $Y$ onto $X$.
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
+   for broadcasting $Y$ onto $X$. 
+2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
+   subsequence, such as shape(Y) = (2, 1) => (2).
 
-If axis is -1, it is treated as axis=rank(X)-rank(Y).
+For example:
 
-For example
   .. code-block:: python
 
     shape(X) = (2, 3, 4, 5), shape(Y) = (,)
     shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
     shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
     shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
 
-Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
-information. However, the output only shares the LoD information with input $X$.
+The inputs $X$ and $Y$ can carry the different LoD information. 
+But the output only shares the LoD information with the input $X$.
 
 )DOC",
                                GetName(), GetEquation()));
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
new file mode 100644
index 0000000000..43f9491111
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fake_dequantize_op.h"
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
+ public:
+  FakeDequantizeMaxAbsOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FakeDequantizeMaxAbsOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input with float-32/64 type is the "
+             "low precision tensor.");
+    AddOutput("Out",
+              "(Tensor) The output is the dequantized high "
+              "precision tensor.");
+    AddAttr<int>("num_bits",
+                 "(int) `num_bits` is the quantization level bits, "
+                 "such as 2, 5, 8.");
+    AddAttr<float>("scale",
+                   "(float) The maximum absolute value of low precision tensor."
+                   "It is usually calculated by the fake_quantize_max_abs_op.");
+    AddComment(R"DOC(
+FakeDequantizeMaxAbsOp operator.
+
+This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
+
+$$Out = \frac{scale*X}{2^{num_bits} - 1}$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
+                  ops::FakeDequantizeMaxAbsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
+                       ops::FakeDequantizeMaxAbsKernel<CPU, float>,
+                       ops::FakeDequantizeMaxAbsKernel<CPU, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
new file mode 100644
index 0000000000..1bd38d1bd2
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fake_dequantize_op.h"
+
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
+                        ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
+                        ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
new file mode 100644
index 0000000000..0901e68b37
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(in->place());
+
+    int num_bits = ctx.Attr<int>("num_bits");
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    int range = std::pow(2, num_bits) - 1;
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    eigen_out.device(dev) = (scale / range) * eigen_in;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 8843a1c44b..a9ae1396db 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FCOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
 
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -65,7 +65,7 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FCOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
 
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
new file mode 100644
index 0000000000..98b051afb5
--- /dev/null
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+class FetchBarrierOp : public framework::OperatorBase {
+ public:
+  FetchBarrierOp(const std::string& type,
+                 const framework::VariableNameMap& inputs,
+                 const framework::VariableNameMap& outputs,
+                 const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<RPCCLIENT_T>();
+
+    rpc_client->Wait();
+
+    for (auto& ep : eps) {
+      VLOG(3) << "fetch barrier, ep: " << ep;
+      rpc_client->AsyncSendFetchBarrier(ep);
+    }
+    rpc_client->Wait();
+  }
+};
+
+class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to list_and_serv op, so that
+the Parameter Server would knew all variables have been sent.
+)DOC");
+
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({"127.0.0.1:6164"});
+  }
+};
+
+class FetchBarrierOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(fetch_barrier, ops::FetchBarrierOp,
+                  paddle::framework::EmptyGradOpMaker, ops::FetchBarrierOpMaker,
+                  ops::FetchBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
index 1ae78675a0..453a1b32a0 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
@@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
 class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
  protected:
   void Apply() override {
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
+    AddAttr<int>(
+        "dtype",
+        "It could be numpy.dtype. Output data type. Default is float32")
         .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<float>("value", "(float, default 0) The value to be filled")
+    AddAttr<float>("value", "default 0. The value to be filled")
         .SetDefault(0.0f);
     AddComment(R"DOC(
-FillConstantBatchSizeLike Operator.
-
-Fill up a variable with specified constant value.
+This function creates a tensor of specified *shape*, *dtype* and batch size,
+and initializes this with a constant supplied in *value*. The batch size is
+obtained from the `input` tensor.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
index 2a7df149a9..63ea60678f 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
@@ -24,6 +24,14 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("Input");
+    if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
+      // set the correct batch size for the LoDTensor.
+      auto odims = out->dims();
+      int output_dim_idx = ctx.Attr<int>("output_dim_idx");
+      odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
+      out->mutable_data<T>(odims, ctx.GetPlace());
+    }
     out->mutable_data<T>(ctx.GetPlace());
     auto value = ctx.Attr<float>("value");
 
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index e21b572589..aa3e05b83b 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -33,7 +33,6 @@ class GatherOp : public framework::OperatorWithKernel {
     auto index_dims = ctx->GetInputDim("Index");
     PADDLE_ENFORCE(index_dims.size() == 1);
     int batch_size = ctx->GetInputDim("Index")[0];
-    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx->GetInputDim("X"));
     output_dims[0] = batch_size;
     ctx->SetOutputDim("Out", output_dims);
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index 9c0561b016..f6b156eb30 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -43,7 +43,8 @@ TEST(Gather, GatherData) {
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
   paddle::operators::CPUGather<int>(ctx, *src, *index, output);
-
+  delete cpu_place;
+  cpu_place = NULL;
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
 
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index a5678f6346..111e58844c 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/grpc_client.h"
-#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 
 namespace paddle {
@@ -60,12 +60,13 @@ class GenNCCLIdOp : public framework::OperatorBase {
 
     std::vector<std::string> endpoint_list =
         Attr<std::vector<std::string>>("endpoint_list");
-    detail::RPCClient client;
+    detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
+
     for (auto& ep : endpoint_list) {
       VLOG(3) << "sending nccl id to " << ep;
-      client.AsyncSendVariable(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
+      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
-    client.Wait();
+    client->Wait();
     VLOG(3) << "sending completed...";
   }
 
@@ -75,21 +76,28 @@ class GenNCCLIdOp : public framework::OperatorBase {
     // NOTE: Can not use unique_ptr here because the default
     // deleter will call GRPC Server's base class's dtor and
     // that will cause a wired crash.
-    detail::AsyncGRPCServer rpc_service(endpoint, true);
+    detail::RequestSendHandler rpc_h(true);
+    std::unique_ptr<detail::RPCServer> rpc_service(
+        new RPCSERVER_T(endpoint, 1));
+
+    rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h);
+    rpc_h.SetRPCServer(rpc_service.get());
+
     framework::ProgramDesc empty_program;
     framework::Executor executor(dev_ctx.GetPlace());
-    rpc_service.SetScope(scope);
-    rpc_service.SetDevCtx(&dev_ctx);
-    rpc_service.SetProgram(&empty_program);
-    rpc_service.SetExecutor(&executor);
+    rpc_h.SetScope(scope);
+    rpc_h.SetDevCtx(&dev_ctx);
+    rpc_h.SetProgram(&empty_program);
+    rpc_h.SetExecutor(&executor);
 
     std::thread server_thread(
-        std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, &rpc_service));
-    rpc_service.SetCond(0);
+        std::bind(&detail::RPCServer::StartServer, rpc_service.get()));
+
+    rpc_service->SetCond(detail::kRequestSend);
     VLOG(3) << "start getting nccl id from trainer 0...";
-    auto recv = rpc_service.Get();
+    rpc_service->WaitBarrier(detail::kRequestSend);
     VLOG(3) << "got nccl id and stop server...";
-    rpc_service.ShutDown();
+    rpc_service->ShutDown();
     VLOG(3) << "rpc server stopped";
     server_thread.join();
   }
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index e38525cd7f..a711da3627 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
         "The output is no longer a LoDTensor.");
     AddComment(R"DOC(
-LinearChainCRF Operator.
-
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
 variables. CRF learns the conditional probability $P(Y|X)$, where
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 57eb5d9a0e..4d12278799 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -12,22 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <stdio.h>  // for removing the port file
+#include <csignal>
+#include <cstdlib>
 #include <fstream>
-#include <ostream>
 #include <thread>  // NOLINT
 #include <vector>
 
+#include "paddle/fluid/operators/detail/macros.h"
+
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 
-void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
-  service->RunSyncUpdate();
+void RunServer(std::shared_ptr<detail::RPCServer> service) {
+  service->StartServer();
   VLOG(4) << "RunServer thread end";
 }
-
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
@@ -58,7 +62,7 @@ static void ParallelExecuteBlocks(
           int run_block = idx;  // thread local
           try {
             executor->RunPreparedContext(prepared[run_block].get(), scope);
-          } catch (std::exception &e) {
+          } catch (const std::exception &e) {
             LOG(ERROR) << "run sub program error " << e.what();
           }
         }));
@@ -66,180 +70,117 @@ static void ParallelExecuteBlocks(
   for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
 }
 
-std::atomic_int ListenAndServOp::selected_port_{0};
-
 ListenAndServOp::ListenAndServOp(const std::string &type,
                                  const framework::VariableNameMap &inputs,
                                  const framework::VariableNameMap &outputs,
                                  const framework::AttributeMap &attrs)
     : OperatorBase(type, inputs, outputs, attrs) {}
 
+ListenAndServOp::~ListenAndServOp() { Stop(); }
+
 void ListenAndServOp::Stop() {
-  rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
+  rpc_service_->ShutDown();
   server_thread_->join();
+  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
+  remove(file_path.c_str());
 }
 
-void ListenAndServOp::SavePort(const std::string &file_path) const {
+void ListenAndServOp::SavePort() const {
   // NOTE: default write file to /tmp/paddle.selected_port
-  selected_port_ = rpc_service_->GetSelectedPort();
-
-  std::ofstream port_file;
-  port_file.open(file_path);
-  port_file << selected_port_.load();
-  port_file.close();
-  VLOG(4) << "selected port written to " << file_path;
+  rpc_service_->SavePort();
 }
 
-void ListenAndServOp::WaitServerReady() {
-  while (selected_port_.load() == 0) {
-  }
+static int64_t GetTimestamp() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
 }
 
-void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
-                                  framework::ProgramDesc *program,
-                                  framework::Scope *recv_scope,
-                                  framework::BlockDesc *prefetch_block) const {
-  auto fan_in = Attr<int>("Fanin");
-
+void ListenAndServOp::RunSyncLoop(
+    framework::Executor *executor, framework::ProgramDesc *program,
+    framework::Scope *recv_scope,
+    const std::vector<int> &prefetch_block_id_list) const {
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
 
-  std::vector<int> block_list;
-  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+  std::vector<int> optimize_block_id_list;
+  for (int blkid = 1; blkid < num_blocks; ++blkid) {
+    if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(),
+                  blkid) == prefetch_block_id_list.end()) {
+      optimize_block_id_list.push_back(blkid);
+    }
   }
-  auto optimize_prepared = executor->Prepare(*program, block_list);
+  auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
   // Insert placeholder for block0 which holds current op itself.
   optimize_prepared.insert(
       optimize_prepared.begin(),
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
-  bool exit_flag = false;
-  // Record received sparse variables, so that
-  // we could reset those after execute optimize program
-  std::vector<framework::Variable *> sparse_vars;
-  while (!exit_flag) {
+  rpc_service_->ResetBarrierCounter();
+  while (true) {
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
-    rpc_service_->SetCond(0);
-    size_t recv_var_cnt = 0;
-    int batch_barrier = 0;
-    while (batch_barrier != fan_in) {
-      const detail::ReceivedMessage v = rpc_service_->Get();
-      auto recv_var_name = v.first;
-      if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
-        LOG(INFO) << "received terminate message and exit";
-        exit_flag = true;
-        break;
-      } else if (recv_var_name == BATCH_BARRIER_MESSAGE) {
-        VLOG(3) << "recv batch barrier message";
-        batch_barrier++;
-        continue;
-      } else {
-        VLOG(3) << "received grad: " << recv_var_name;
-        recv_var_cnt++;
-        auto var = v.second->GetVar();
-        if (var == nullptr) {
-          LOG(ERROR) << "Can not find server side var: " << recv_var_name;
-          PADDLE_THROW("Can not find server side var");
-        }
-        if (var->IsType<framework::SelectedRows>()) {
-          sparse_vars.push_back(var);
-        }
-      }
-    }
-    if (exit_flag) {
-      rpc_service_->SetCond(1);
-      rpc_service_->ShutDown();
+    rpc_service_->SetCond(detail::kRequestSend);
+    rpc_service_->WaitBarrier(detail::kRequestSend);
+
+    if (rpc_service_->IsExit()) {
+      LOG(WARNING) << "get exit!rpc_processor break!";
+      rpc_service_->SetCond(detail::kRequestGet);
       break;
     }
 
     // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
     // and this will still work.
-
     // The optimize blocks which have the same parent ID would run parallel
     // TODO(Yancey1989): need to use ParallelExecutor for future
     int32_t last_parent_blkid = program->Block(1).Parent();
     std::vector<size_t> parallel_blkids;
     parallel_blkids.push_back(1);
-    double ts = detail::GetTimestamp();
-    for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (blkid != static_cast<size_t>(prefetch_block->ID())) {
-        if (program->Block(blkid).Parent() != last_parent_blkid) {
-          ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                                program, recv_scope);
-          parallel_blkids.clear();
-          last_parent_blkid = program->Block(blkid).Parent();
-        }
-        parallel_blkids.push_back(blkid);
+    double ts = GetTimestamp();
+    for (size_t i = 1; i < optimize_block_id_list.size(); ++i) {
+      // skip the first optimize block because it is already in the
+      // parallel_blkids.
+      int blkid = optimize_block_id_list[i];
+      if (program->Block(blkid).Parent() != last_parent_blkid) {
+        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+                              program, recv_scope);
+        parallel_blkids.clear();
+        last_parent_blkid = program->Block(blkid).Parent();
       }
+      parallel_blkids.push_back(blkid);
     }
     ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
                           recv_scope);
-    VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
-
-    // Reset the received sparse variables, the sum operator would not
-    // sum the input sparse variables which rows is empty at the next
-    // mini-batch.
-    // TODO(Yancey1989): move the reset action into an operator, we couldn't
-    // have any hide logic in the operator.
-    for (auto &var : sparse_vars) {
-      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-    }
-    rpc_service_->SetCond(1);
-    // FIXME(typhoonzero): use another condition to sync wait clients get.
-    rpc_service_->WaitClientGet(fan_in);
-    sparse_vars.clear();
+    VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
+
+    rpc_service_->SetCond(detail::kRequestGet);
+    rpc_service_->WaitBarrier(detail::kRequestGet);
+    rpc_service_->ResetBarrierCounter();
+    // reset received sparse vars to avoid reuse it in the next mini-batch
+    dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get())
+        ->ResetSparseVarRecorder();
   }  // while(true)
 }
 
-static void AsyncUpdateThread(
-    const std::string &var_name, const bool &exit_flag,
-    const std::shared_ptr<detail::ReceivedQueue> &queue,
-    framework::Executor *executor,
-    framework::ExecutorPrepareContext *prepared) {
-  VLOG(3) << "update thread for " << var_name << " started";
-  while (!exit_flag) {
-    const detail::ReceivedMessage v = queue->Pop();
-    auto recv_var_name = v.first;
-    auto var = v.second->GetVar();
-    if (var == nullptr) {
-      LOG(ERROR) << "Can not find server side var: " << recv_var_name;
-      PADDLE_THROW("Can not find server side var");
-    }
-    auto fs = framework::Async([var_name, &executor, &v, prepared] {
-      try {
-        executor->RunPreparedContext(prepared,
-                                     v.second->GetMutableLocalScope());
-      } catch (std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
-      }
-    });
-    fs.wait();
-  }
-}
-
 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                    framework::ProgramDesc *program) const {
   VLOG(3) << "RunAsyncLoop in";
   // grad name to block id
   std::unordered_map<std::string, int32_t> grad_to_block_id;
   std::unordered_map<int32_t, std::string> id_to_grad;
-  std::unordered_map<std::string, std::shared_ptr<detail::ReceivedQueue>>
-      grad_to_queue;
 
   auto grad_to_block_id_str =
       Attr<std::vector<std::string>>("grad_to_block_id");
-  for (auto &grad_and_id : grad_to_block_id_str) {
+  for (const auto &grad_and_id : grad_to_block_id_str) {
     std::vector<std::string> pieces;
     split(grad_and_id, ':', &pieces);
     VLOG(3) << "after split, grad = " << pieces[0] << ", id=" << pieces[1];
     PADDLE_ENFORCE_EQ(pieces.size(), 2);
     PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0);
+
     int block_id = std::stoi(pieces[1]);
     grad_to_block_id[pieces[0]] = block_id;
-    grad_to_queue[pieces[0]] = std::make_shared<detail::ReceivedQueue>();
     id_to_grad[block_id] = pieces[0];
   }
   size_t num_blocks = program->Size();
@@ -258,40 +199,37 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
     grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i];
   }
 
-  bool exit_flag = false;
-
-  VLOG(3) << "start async optimize threads";
-  std::vector<std::future<void>> fs;
-  for (auto iter = grad_to_queue.begin(); iter != grad_to_queue.end(); iter++) {
-    std::string grad_name = iter->first;
-    VLOG(3) << "create async update thread for " << grad_name;
-    fs.push_back(framework::AsyncIO([grad_name, &exit_flag, &executor,
-                                     &grad_to_queue, &grad_to_prepared_ctx]() {
-      AsyncUpdateThread(grad_name, exit_flag, grad_to_queue[grad_name],
-                        executor, grad_to_prepared_ctx[grad_name].get());
-    }));
-  }
+  request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
 
   VLOG(3) << "RunAsyncLoop into while";
-  while (!exit_flag) {
-    const detail::ReceivedMessage v = rpc_service_->Get();
-    auto recv_var_name = v.first;
-    if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
-      LOG(INFO) << "received terminate message and exit";
-      exit_flag = true;
+  while (true) {
+    if (rpc_service_->IsExit()) {
+      LOG(INFO) << "get exit!rpc_processor break!";
       break;
-    } else {
-      VLOG(3) << "received grad: " << recv_var_name;
-      grad_to_queue[recv_var_name]->Push(v);
     }
 
-    if (exit_flag) {
-      rpc_service_->ShutDown();
-      break;
-    }
+    sleep(1);
   }  // while(true)
 }
 
+static void FillRequestCtx(
+    detail::RequestHandler *h, framework::Scope *scope,
+    platform::DeviceContext *dev_ctx, framework::Executor *executor,
+    framework::ProgramDesc *program,
+    std::unordered_map<std::string,
+                       std::shared_ptr<framework::ExecutorPrepareContext>>
+        *prefetch_ctx,
+    detail::RPCServer *rpc_server) {
+  h->SetScope(scope);
+  h->SetDevCtx(dev_ctx);
+  h->SetExecutor(executor);
+  h->SetProgram(program);
+  h->SetPrefetchPreparedCtx(prefetch_ctx);
+  h->SetRPCServer(rpc_server);
+}
+
 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
   // Mark this as PS that it should decide profiling by listening from trainer.
@@ -301,39 +239,81 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   framework::Scope &recv_scope = scope.NewScope();
 
   bool sync_mode = Attr<bool>("sync_mode");
+  auto fan_in = Attr<int>("Fanin");
 
   PADDLE_ENFORCE(!rpc_service_);
   std::string endpoint = Attr<std::string>("endpoint");
 
-  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, sync_mode));
+  LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
+            << ", end_point:" << endpoint;
+
+  rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
+
+  request_send_handler_.reset(new detail::RequestSendHandler(sync_mode));
+  request_get_handler_.reset(new detail::RequestGetHandler(sync_mode));
+  request_prefetch_handler_.reset(
+      new detail::RequestPrefetchHandler(sync_mode));
+
+  rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get());
+  rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get());
+  rpc_service_->RegisterRPC(detail::kRequestPrefetch,
+                            request_prefetch_handler_.get());
 
   auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
   auto *program = optimize_block->Program();
   framework::Executor executor(dev_place);
 
-  // prepare rpc_service
-  rpc_service_->SetScope(&recv_scope);
-  rpc_service_->SetDevCtx(&dev_ctx);
-  rpc_service_->SetProgram(program);
-  rpc_service_->SetExecutor(&executor);
-
   // prepare for prefetch
-  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
-  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
-  rpc_service_->SetPrefetchPreparedCtx(std::move(prefetch_prepared));
+  std::vector<int> prefetch_block_id_list;
+  std::unordered_map<int, std::string> block_id_to_prefetch_var_name;
+
+  auto prefetch_var_name_to_block_id_str =
+      Attr<std::vector<std::string>>(kPrefetchVarNameToBlockId);
+  for (const auto &prefetch_var_name_and_id :
+       prefetch_var_name_to_block_id_str) {
+    std::vector<std::string> pieces;
+    split(prefetch_var_name_and_id, ':', &pieces);
+    VLOG(3) << "after split, prefetch_var = " << pieces[0]
+            << ", id=" << pieces[1];
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+
+    int block_id = std::stoi(pieces[1]);
+    prefetch_block_id_list.push_back(block_id);
+    block_id_to_prefetch_var_name[block_id] = pieces[0];
+  }
+
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list);
+
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared_ctx;
+  for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) {
+    auto block_id = prefetch_block_id_list[i];
+    auto prefetch_var_name = block_id_to_prefetch_var_name[block_id];
+    prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
+  }
+
+  auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
+                     &dev_ctx, &executor, program,
+                     &prefetch_var_name_to_prepared_ctx, rpc_service_.get());
+
+  f(request_send_handler_.get());
+  f(request_get_handler_.get());
+  f(request_prefetch_handler_.get());
 
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
   VLOG(3) << "wait server thread to become ready...";
   rpc_service_->WaitServerReady();
 
+  // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
+  signal(SIGINT, SignalHandler::StopAndExit);
+  signal(SIGTERM, SignalHandler::StopAndExit);
+
   // Write to a file of server selected port for python use.
-  std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port",
-                                          static_cast<int>(::getpid()));
-  SavePort(file_path);
+  SavePort();
   if (sync_mode) {
-    RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
+    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list);
   } else {
     RunAsyncLoop(&executor, program);
   }
@@ -343,12 +323,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
     AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
-    AddComment(R"DOC(
-ListenAndServ operator
-
-This operator will start a RPC server which can receive variables
-from send_op and send back variables to recv_op.
-)DOC");
+    AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
+" will start a RPC server which can receive variables from send_op and send" +
+"back variables to recv_op.)DOC");
     AddAttr<std::string>("endpoint",
                          "(string, default 127.0.0.1:6164)"
                          "IP address to listen on.")
@@ -362,13 +339,19 @@ from send_op and send back variables to recv_op.
     AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
     AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                     "BlockID to run on server side.");
-    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
-                                    "prefetch block to run on server side.");
+    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
+                                      "prefetch blocks to run on server side.")
+        .SetDefault({});
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
   }
 };
 
+void SignalHandler::StopAndExit(int signal_num) {
+  VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
+  exit(0);
+}
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index f52a55c5c2..46c3a19e20 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -16,22 +16,24 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <atomic>
-#include <ostream>
+#include <set>
 #include <string>
+#include <vector>
 
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
 
 namespace paddle {
 namespace operators {
 
 constexpr char kOptimizeBlock[] = "OptimizeBlock";
-constexpr char kPrefetchBlock[] = "PrefetchBlock";
+constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 
-void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
+void RunServer(std::shared_ptr<detail::RPCServer> service);
 
 class ListenAndServOp : public framework::OperatorBase {
  public:
@@ -40,33 +42,40 @@ class ListenAndServOp : public framework::OperatorBase {
                   const framework::VariableNameMap& outputs,
                   const framework::AttributeMap& attrs);
 
+  virtual ~ListenAndServOp();
+
   void RunSyncLoop(framework::Executor* executor,
                    framework::ProgramDesc* program,
                    framework::Scope* recv_scope,
-                   framework::BlockDesc* prefetch_block) const;
+                   const std::vector<int>& prefetch_block_id_list) const;
 
   void RunAsyncLoop(framework::Executor* executor,
                     framework::ProgramDesc* program) const;
 
-  void SavePort(
-      const std::string& file_path = "/tmp/paddle.selected_port") const;
-
-  void WaitServerReady();
+  void SavePort() const;
 
-  int GetSelectedPort() { return selected_port_; }
+  int GetSelectedPort() { return rpc_service_->GetSelectedPort(); }
 
   void Stop() override;
 
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override;
 
-  static void ResetPort() { selected_port_ = 0; }
-
  protected:
-  mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
+  mutable std::shared_ptr<detail::RPCServer> rpc_service_;
+  mutable std::shared_ptr<detail::RequestHandler> request_send_handler_;
+  mutable std::shared_ptr<detail::RequestHandler> request_get_handler_;
+  mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_;
+
   mutable std::shared_ptr<std::thread> server_thread_;
-  // FIXME(wuyi): it's static so that the operator can be cloned.
-  static std::atomic_int selected_port_;
+};
+
+class SignalHandler {
+ public:
+  static void StopAndExit(int signal_num);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(SignalHandler);
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 93f45cff8a..8f4b504927 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase {
 class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddOutput("Out", "(Tensor) The tensor need to be loaded");
+    AddOutput("Out", "The tensor need to be loaded");
     AddAttr<bool>(
         "load_as_fp16",
-        "(boolean, default false)"
         "If true, the tensor will be first loaded and then "
         "converted to float16 data type. Otherwise, the tensor will be "
-        "directly loaded without data type conversion.")
+        "directly loaded without data type conversion. Default is false.")
         .SetDefault(false);
     AddAttr<std::string>("file_path",
-                         "(string) "
-                         "Variable will be loaded from \"file_path\".")
+                         R"(Variable will be loaded from "file_path")")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
-    AddComment(R"DOC(
-Load Operator.
-
-Load operator will load a tensor variable from disk file.
-
-)DOC");
+    AddComment("Load operator will load a tensor variable from disk file.");
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
index d07a819685..2ce11e712f 100644
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -127,7 +127,7 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(-1.0f);
     AddAttr<float>("max",
                    "(float, default 1.0) "
-                   "Maximun value of uniform random")
+                   "Maximum value of uniform random")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
                  "(int, default 0) "
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 52b9cd7fb7..52b459a6a2 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -124,16 +124,17 @@ namespace {
 framework::OpKernelType GetExpectedLRNKernel(
     const framework::ExecutionContext& ctx) {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_MKLDNN
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
       layout_, library_);
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index dabde43850..1a37cb39d5 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -125,6 +125,12 @@ class Blas {
   template <typename T>
   void AXPY(int n, T alpha, const T* x, T* y) const;
 
+  template <typename T>
+  void VADD(int n, const T* x, const T* y, T* z) const;
+
+  template <typename T>
+  void VCOPY(int n, const T* x, T* y) const;
+
   template <typename T>
   void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
             T* C) const;
@@ -163,6 +169,16 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template AXPY<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VADD(ARGS... args) const {
+    Base()->template VADD<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void VCOPY(ARGS... args) const {
+    Base()->template VCOPY<T>(args...);
+  }
+
   template <typename... ARGS>
   void GEMV(ARGS... args) const {
     Base()->template GEMV<T>(args...);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 14b3624b42..ae20406bc2 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -34,6 +34,18 @@ struct CBlas<float> {
     cblas_saxpy(args...);
   }
 
+#ifdef PADDLE_WITH_MKLML
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    vsAdd(args...);
+  }
+#endif
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    cblas_scopy(args...);
+  }
+
   template <typename... ARGS>
   static void GEMV(ARGS... args) {
     cblas_sgemv(args...);
@@ -59,6 +71,18 @@ struct CBlas<double> {
     cblas_daxpy(args...);
   }
 
+#ifdef PADDLE_WITH_MKLML
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    vdAdd(args...);
+  }
+#endif
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    cblas_dcopy(args...);
+  }
+
   template <typename... ARGS>
   static void GEMV(ARGS... args) {
     cblas_dgemv(args...);
@@ -139,6 +163,24 @@ void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
   CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
 }
 
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VCOPY(int n, const T *x, T *y) const {
+  CBlas<T>::VCOPY(n, x, 1, y, 1);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
+                                            T *z) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VADD(n, x, y, z);
+#else
+  this->template VCOPY<T>(n, y, z);
+  this->template AXPY<T>(n, 1., x, z);
+#endif
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
index fc0fca5ad3..caff35e03a 100644
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
@@ -46,7 +46,10 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
 
       const int64_t* label_data = labels->data<int64_t>();
       for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
+        int lbl = label_data[i];
+        PADDLE_ENFORCE_GE(lbl, 0);
+        PADDLE_ENFORCE_LT(lbl, class_num);
+        int index = i * class_num + lbl;
         loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
       }
     }
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 3719a264e9..b545671b43 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) {
   paddle::platform::CPUDeviceContext context(*cpu_place);
   GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
                                input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
+  delete cpu_place;
+  cpu_place = NULL;
 
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
index 8e508b68ee..b1e69f375d 100644
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
 class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("RankTable", "The lod_rank_table.");
-    AddOutput("Out", "The max sequence length.");
-    AddComment(
-        R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
+    AddInput("RankTable", "Input variable which is a LoDRankTable object");
+    AddOutput("Out", "The max sequence length");
+    AddComment(R"DOC(
+    Given a LoDRankTable object, this layer returns the max length of
+    a batch of sequences. In fact, a LoDRankTable object contains a list of
+    tuples(<sequence index, sequence length>) and the list is already sorted by
+    sequence length in descending order, so the operator just returns the
+    sequence length of the first tuple element
+)DOC");
   }
 };
 
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 74477eb439..4881cff4a3 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op");
+    AddOutput("Out", "The output of mean op").Reuse("X");
     AddComment(R"DOC(
 Mean Operator.
 
diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h
index f26a165b5a..85664623d7 100644
--- a/paddle/fluid/operators/mkldnn_activation_op.h
+++ b/paddle/fluid/operators/mkldnn_activation_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -60,52 +62,5 @@ class MKLDNNActivationGradKernel
   }
 };
 
-namespace {  // NOLINT
-framework::OpKernelType GetKernelType(
-    const framework::ExecutionContext& ctx,
-    const framework::OperatorWithKernel& oper) {
-  framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_MKLDNN
-  if (library == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kMKLDNN;
-  }
-#endif
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-      ctx.GetPlace(), layout, library);
-}
-}  // anonymous namespace
-
-class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this);
-  }
-};
-
-class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/mul_mkldnn_op.cc b/paddle/fluid/operators/mul_mkldnn_op.cc
deleted file mode 100644
index a5f3a98f67..0000000000
--- a/paddle/fluid/operators/mul_mkldnn_op.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/mul_op.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
-
-template <typename Format = mkldnn::memory::format>
-mkldnn::memory::desc type(const std::vector<int>& dims, Format&& f) {
-  return platform::MKLDNNMemDesc(dims, mkldnn::memory::data_type::f32, f);
-}
-
-template <typename T>
-class MulMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    auto mkldnn_engine = dev_ctx.GetEngine();
-
-    auto input = ctx.Input<Tensor>("X");
-    auto weight = ctx.Input<Tensor>("Y");
-
-    PADDLE_ENFORCE(input->dims().size() & (2 | 4),
-                   "Input must be with 2 or 4 dimensions, i.e. NC or NCHW");
-    PADDLE_ENFORCE(weight->dims().size() & (2 | 4),
-                   "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");
-
-    std::vector<int> w_tz = paddle::framework::vectorize2int(weight->dims());
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-
-    auto src_md =
-        src_tz.size() != 2
-            ? type(src_tz, mkldnn::memory::format::nchw)
-            : type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc);
-
-    auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc);
-
-    auto weights_md =
-        src_tz.size() != 2
-            ? type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]},
-                   mkldnn::memory::format::oihw)
-            : type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi);
-
-    auto output = ctx.Output<Tensor>("Out");
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-
-    const std::string key = ctx.op().Output("Out");
-    const std::string key_fc_pd = key + "@mul_pd";
-
-    const T* input_data = input->data<T>();
-    const T* w_data = weight->data<T>();
-
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
-
-    auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
-                                     platform::to_void_cast(input_data));
-
-    auto weights_memory = mkldnn::memory({weights_md, mkldnn_engine},
-                                         platform::to_void_cast(w_data));
-
-    auto pd = platform::MKLDNNFwdPrimitiveDesc<mkldnn::inner_product_forward>(
-        mkldnn_engine, src_md, weights_md, dst_md);
-
-    dev_ctx.SetBlob(key_fc_pd, pd);
-
-    auto forward = mkldnn::inner_product_forward(*pd, src_memory,
-                                                 weights_memory, dst_memory);
-
-    std::vector<mkldnn::primitive> pipeline = {forward};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-  }
-};
-
-template <typename T>
-class MulMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    auto mkldnn_engine = dev_ctx.GetEngine();
-
-    const Tensor* input = ctx.Input<Tensor>("X");
-    const Tensor* w = ctx.Input<Tensor>("Y");
-
-    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
-
-    const std::string key = ctx.op().Input("Out");
-    const std::string key_fc_pd = key + "@mul_pd";
-
-    const T* input_data = input->data<T>();
-    const T* w_data = w->data<T>();
-    const T* out_grad_data = out_grad->data<T>();
-    T* input_grad_data = nullptr;
-    T* w_grad_data = nullptr;
-
-    if (input_grad) {
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (w_grad) {
-      w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-    std::vector<int> w_tz = paddle::framework::vectorize2int(w->dims());
-
-    auto src_md =
-        src_tz.size() != 2
-            ? type(src_tz, mkldnn::memory::format::nchw)
-            : type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc);
-
-    auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc);
-
-    auto weights_md =
-        src_tz.size() != 2
-            ? type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]},
-                   mkldnn::memory::format::oihw)
-            : type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi);
-
-    auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
-                                     platform::to_void_cast(input_data));
-
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine},
-                                     platform::to_void_cast(out_grad_data));
-
-    auto weight_memory = mkldnn::memory({weights_md, mkldnn_engine},
-                                        platform::to_void_cast(w_data));
-
-    auto pd =
-        std::static_pointer_cast<mkldnn::inner_product_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_fc_pd));
-
-    PADDLE_ENFORCE(pd != nullptr, "Fail to find pd in device context");
-
-    if (w_grad) {
-      auto weights_grad_memory = mkldnn::memory(
-          {weights_md, mkldnn_engine}, platform::to_void_cast(w_grad_data));
-
-      auto bwd_weight_pd = platform::MKLDNNBwdPrimitiveDesc<
-          mkldnn::inner_product_backward_weights>(mkldnn_engine, *pd, src_md,
-                                                  weights_md, dst_md);
-
-      auto bwd_weights_prim = mkldnn::inner_product_backward_weights(
-          bwd_weight_pd, src_memory, dst_memory, weights_grad_memory);
-
-      std::vector<mkldnn::primitive> pipeline{bwd_weights_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-    }
-
-    if (input_grad) {
-      auto src_grad_memory = mkldnn::memory(
-          {src_md, mkldnn_engine}, platform::to_void_cast(input_grad_data));
-
-      auto bwd_data_pd =
-          platform::MKLDNNBwdPrimitiveDesc<mkldnn::inner_product_backward_data>(
-              mkldnn_engine, *pd, src_md, weights_md, dst_md);
-
-      auto bwd_data_prim = mkldnn::inner_product_backward_data(
-          bwd_data_pd, dst_memory, weight_memory, src_grad_memory);
-
-      std::vector<mkldnn::primitive> pipeline{bwd_data_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::MulMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL(mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::MulMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index a43739463c..51993398bd 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -16,10 +16,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {
 
@@ -76,22 +72,6 @@ class MulOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
-
- private:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-    }
-#endif
-    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library);
-  }
 };
 
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -120,9 +100,6 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
         )DOC")
         .SetDefault(1)
         .EqualGreaterThan(1);
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
     AddAttr<int>(
         "y_num_col_dims",
         R"DOC((int, default 1), The mul_op can take tensors with more than two,
@@ -177,22 +154,6 @@ class MulGradOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim(y_grad_name, y_dims);
     }
   }
-
- private:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-    }
-#endif
-    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library);
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index cdbc975c02..aa19c62c83 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -16,40 +16,34 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename AttrType>
 class NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of norm operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of feature.");
-    AddInput("Scale",
-             "(Tensor) The input tensor of norm operator. "
-             "The format of input tensor is C * 1.");
-    AddAttr<AttrType>("epsilon",
-                      "(float, default 1e-10) Constant "
-                      "for numerical stability.")
+    AddInput("X", "(Tensor) A tensor of rank >= axis.");
+    AddAttr<int>("axis",
+                 "The axis on which to apply normalization. If axis < 0, "
+                 "the dimension to normalization is rank(X) + axis. -1 is "
+                 "the last dimension.");
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-10) The epsilon value is used "
+                   "to avoid division by zero.")
         .SetDefault(1.0e-10f);
-    AddOutput("Out",
-              "(Tensor) The output tensor of norm operator."
-              "N * M."
-              "M = C * H * W");
+    AddOutput("Norm",
+              "(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will "
+              "be used in backward kernel.")
+        .AsIntermediate();
+    AddOutput("Out", "(Tensor) A tensor of the same shape as X.");
     AddComment(R"DOC(
-       "Input shape: $(N, C, H, W)$
-        Scale shape: $(C, 1)$
-        Output shape: $(N, C, H, W)$
-        Where
-        forward
-          $$
-            [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot  \cdot  \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
-          $$
-        backward
-          $$
-            \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
-          $$
-        )DOC");
+
+Given a tensor, apply 2-normalization along the provided axis.
+
+$$
+y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
+$$
+
+where, $\sum {x^2}$ is calculated along the `axis` dimension.
+        
+)DOC");
   }
 };
 
@@ -58,15 +52,15 @@ class NormOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of NormOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Scale"),
-                   "Input(Scale) of NormOp"
-                   "should not be null.");
+                   "Input(X) of NormOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of NormOp should not be null.");
-    auto in_x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", in_x_dims);
+    auto xdim = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", xdim);
+    int axis = ctx->Attrs().Get<int>("axis");
+    if (axis < 0) axis = xdim.size() + axis;
+    xdim[axis] = 1;
+    ctx->SetOutputDim("Norm", xdim);
   }
 };
 
@@ -84,12 +78,12 @@ class NormOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
-REGISTER_OP_CPU_KERNEL(
-    norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
+REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel<CPU, float>,
+                       ops::NormKernel<CPU, double>);
+REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel<CPU, float>,
+                       ops::NormGradKernel<CPU, double>);
diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
index d1d9be5074..1d0021d33f 100644
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
@@ -16,9 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/norm_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
-REGISTER_OP_CUDA_KERNEL(
-    norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
+using CUDA = paddle::platform::CUDADeviceContext;
+
+REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel<CUDA, float>,
+                        ops::NormKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel<CUDA, float>,
+                        ops::NormGradKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h
index 0ad29e8a03..3167bdc8ac 100644
--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
@@ -19,156 +19,110 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T, typename AttrType = T>
+inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
+                    int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
+template <typename DeviceContext, typename T>
 class NormKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
-    out->mutable_data<T>(context.GetPlace());
-    int batch_size = in_x->dims()[0];
-    int channels = in_x->dims()[1];
-    int height = in_x->dims()[2];
-    int width = in_x->dims()[3];
-    int fea_len = height * width;
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    auto x =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
-    // get square
-    framework::Tensor x_square;
-    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
-    auto x_square_eigen =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            x_square, framework::make_ddim({batch_size, fea_len * channels}));
-    x_square_eigen.device(*place) = x.square();
-    auto scale_eigen =
-        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
-            *scale);
-    for (int n = 0; n < batch_size; ++n) {
-      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
-      auto in_x_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              in_x_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
-      auto x_square_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              x_square_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor out_batch = out->Slice(n, n + 1);
-      auto out_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              out_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor tmp_tensor;
-      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
-                                 context.GetPlace());
-      auto tmp = framework::EigenVector<T, Eigen::RowMajor,
-                                        Eigen::DenseIndex>::Flatten(tmp_tensor);
-      // get colsum and sqrt , inverse
-      auto dim = Eigen::array<int, 1>({{0}});
-      tmp.device(*place) = x_square_batch_eigen.sum(dim);
-      tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
-      Eigen::array<int, 2> broadcast_dim_col;
-      broadcast_dim_col[1] = 1;
-      broadcast_dim_col[0] = channels;
-      out_batch_eigen.device(*place) =
-          in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
-      Eigen::array<int, 2> broadcast_dim_row;
-      broadcast_dim_row[1] = fea_len;
-      broadcast_dim_row[0] = 1;
-      out_batch_eigen.device(*place) =
-          out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
-    }
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_x = ctx.Input<framework::Tensor>("X");
+    auto* out_y = ctx.Output<framework::Tensor>("Out");
+    auto* out_norm = ctx.Output<framework::Tensor>("Norm");
+    out_y->mutable_data<T>(ctx.GetPlace());
+    out_norm->mutable_data<T>(ctx.GetPlace());
+
+    auto xdim = in_x->dims();
+    auto ndim = out_norm->dims();
+    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) axis = xdim.size() + axis;
+    int pre, n, post;
+    GetDims(xdim, axis, &pre, &n, &post);
+
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+
+    Eigen::DSizes<int, 3> shape(pre, n, post);
+    Eigen::DSizes<int, 2> norm_shape(pre, post);
+
+    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
+    auto y_e = framework::EigenVector<T>::Flatten(*out_y);
+    auto norm_e = framework::EigenVector<T>::Flatten(*out_norm);
+    auto x = x_e.reshape(shape);
+    auto y = y_e.reshape(shape);
+    auto norm = norm_e.reshape(norm_shape);
+
+    Eigen::DSizes<int, 1> rdim(1);
+    // y = x / sqrt((sum(x * x) + epsilon))
+    // norm = sqrt(sum(x * x) + epsilon)
+    auto sum = x.pow(2).sum(rdim) + eps;
+    norm.device(*place) = sum.sqrt();
+    // y = x / norm
+    Eigen::DSizes<int, 3> rshape(pre, 1, post);
+    Eigen::DSizes<int, 3> bcast(1, n, 1);
+    y.device(*place) = x / norm.reshape(rshape).broadcast(bcast);
   }
 };
 template <typename DeviceContext, typename T, typename AttrType = T>
 class NormGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
-    const framework::Tensor* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
-    framework::Tensor* in_x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    in_x_grad->mutable_data<T>(context.GetPlace());
-    int batch_size = in_x->dims()[0];
-    int channels = in_x->dims()[1];
-    int height = in_x->dims()[2];
-    int width = in_x->dims()[3];
-    int fea_len = height * width;
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    auto scale_eigen =
-        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
-            *scale);
-    auto x =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
-    // get square
-    framework::Tensor x_square;
-    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
-    auto x_square_eigen =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            x_square, framework::make_ddim({batch_size, fea_len * channels}));
-    x_square_eigen.device(*place) = x.square();
-
-    for (int n = 0; n < batch_size; ++n) {
-      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
-      auto in_x_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              in_x_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
-      auto in_g_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              in_g_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
-      auto x_square_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              x_square_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
-      auto outg_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              outg_batch, framework::make_ddim({channels, fea_len}));
-
-      framework::Tensor tmp_tensor;
-      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
-                                 context.GetPlace());
-      auto tmp_eigen =
-          framework::EigenVector<T, Eigen::RowMajor,
-                                 Eigen::DenseIndex>::Flatten(tmp_tensor);
-      auto dim = Eigen::array<int, 1>({{0}});
-      tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
-      framework::Tensor norm_tmp_tensor;
-      norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
-                                      context.GetPlace());
-      auto norm_tmp_eigen =
-          framework::EigenVector<T, Eigen::RowMajor,
-                                 Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
-      norm_tmp_eigen.device(*place) =
-          (x_square_batch_eigen.sum(dim) + epsilon).sqrt();
-      Eigen::array<int, 2> broadcast_dim_col;
-      broadcast_dim_col[1] = 1;
-      broadcast_dim_col[0] = channels;
-      in_g_batch_eigen.device(*place) =
-          in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
-      in_g_batch_eigen.device(*place) =
-          in_g_batch_eigen /
-          (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
-      in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
-      // outg_batch_eigen + (in_g_batch_eigen * -1);
-      in_g_batch_eigen.device(*place) =
-          in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
-      Eigen::array<int, 2> broadcast_dim_row;
-      broadcast_dim_row[1] = fea_len;
-      broadcast_dim_row[0] = 1;
-      in_g_batch_eigen.device(*place) =
-          in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
-    }
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_x = ctx.Input<framework::Tensor>("X");
+    auto* in_norm = ctx.Input<framework::Tensor>("Norm");
+    auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    out_dx->mutable_data<T>(ctx.GetPlace());
+
+    auto xdim = in_x->dims();
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) axis = xdim.size() + axis;
+    int pre, n, post;
+    GetDims(xdim, axis, &pre, &n, &post);
+
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+
+    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
+    auto dy_e = framework::EigenVector<T>::Flatten(*in_dy);
+    auto norm_e = framework::EigenVector<T>::Flatten(*in_norm);
+    auto dx_e = framework::EigenVector<T>::Flatten(*out_dx);
+
+    Eigen::DSizes<int, 3> shape(pre, n, post);
+    Eigen::DSizes<int, 2> norm_shape(pre, post);
+    auto x = x_e.reshape(shape);
+    auto dy = dy_e.reshape(shape);
+    auto norm = norm_e.reshape(norm_shape);
+    auto dx = dx_e.reshape(shape);
+
+    framework::Tensor rsum;
+    rsum.mutable_data<T>({pre, post}, ctx.GetPlace());
+    auto sum = framework::EigenTensor<T, 2>::From(rsum);
+
+    Eigen::DSizes<int, 1> rdim(1);
+    Eigen::DSizes<int, 3> bcast(1, n, 1);
+    Eigen::DSizes<int, 3> rshape(pre, 1, post);
+
+    // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)]
+    //    = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x))
+    //    = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x))
+    // 1. sum = sum(x*dy)
+    sum.device(*place) = (x * dy).sum(rdim);
+    // 2. dx = x * sum
+    dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x;
+    // 3. dx / (sum(x*x) + e)
+    // where, norm.pow(2) = sum(x*x) + e, which is calculated in forward.
+    dx.device(*place) = dx / norm.pow(2).broadcast(bcast);
+    // 4. [dy - dx] / sqrt(sum(x*x))
+    dx.device(*place) = (dy - dx) / norm.broadcast(bcast);
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index d60a99994e..be55bc43b1 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -135,7 +135,11 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 
     PoolingMode pooling_mode;
     if (pooling_type == "max") {
-      pooling_mode = PoolingMode::kMaximum;
+      if (FLAGS_cudnn_deterministic) {
+        pooling_mode = PoolingMode::kMaximumDeterministic;
+      } else {
+        pooling_mode = PoolingMode::kMaximum;
+      }
     } else {
       pooling_mode = PoolingMode::kAverage;
     }
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc
index 60e936298d..5341187d1c 100644
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -18,16 +18,24 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using mkldnn::memory;  // Note: paddle has also "memory" namespace
-using mkldnn::pooling_forward;
+using framework::DataLayout;
+using mkldnn::memory;
 using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
+using platform::to_void_cast;
 
 // Generate keys for storing/retriving primitives for this operator
 // TODO(jczaja): Make hashing function more optimial
-static std::string gethash(memory::dims& input_dims, std::string& pooling_type,
-                           std::vector<int>& ksize, std::vector<int>& strides,
-                           std::vector<int>& paddings, std::string suffix) {
-  auto dims2str = [](memory::dims& operand_dims) {
+static std::string gethash(const memory::dims& input_dims,
+                           const std::string& pooling_type,
+                           const std::vector<int>& ksize,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::string& suffix) {
+  auto dims2str = [](const memory::dims& operand_dims) {
     std::string dstr = "";
     for (size_t i = 0; i < operand_dims.size(); ++i) {
       dstr += std::to_string(operand_dims[i]) + "-";
@@ -52,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* input = ctx.Input<Tensor>("X");
     Tensor* output = ctx.Output<Tensor>("Out");
 
-    // Get an unique name from "argument" name of "Out" variable
-    // This name will be used as key when saving info into device context
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
@@ -79,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
+    auto input_format = input->format();
+    memory::format output_format{memory::format::format_undef};
+
     const std::string key = gethash(src_tz, pooling_type, ksize, strides,
                                     paddings, ctx.op().Output("Out"));
     const std::string key_pool_p = key + "@pool_p";
@@ -91,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto pool_p =
         std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
     if (pool_p == nullptr) {
-      // TODO(pzelazko-intel): support more formats
+      auto src_md = platform::MKLDNNMemDesc(
+          src_tz, platform::MKLDNNGetDataType<T>(), input_format);
 
-      auto src_md =
-          platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType<T>(),
-                                  mkldnn::memory::format::nchw);
-      auto dst_md =
-          platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType<T>(),
-                                  mkldnn::memory::format::nchw);
+      /* create memory descriptor for pooling without specified format
+       * ('any') which lets a primitive (pooling in this case) choose
+       * the memory format preferred for best performance
+       */
+      auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
+                                            mkldnn::memory::format::any);
 
-      std::shared_ptr<pooling_forward::primitive_desc> pool_pd =
+      std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
           CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
                               pooling_type, mkldnn_engine);
 
@@ -113,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       // save pool_workspace_memory to be referred in backward path
       dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
 
-      auto pool_src_memory_p = std::make_shared<memory>(
-          memory::primitive_desc{src_md, mkldnn_engine},
-          static_cast<void*>(const_cast<T*>(input_data)));
-      dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p);
+      auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
+                                                 to_void_cast<T>(input_data));
+      auto dst_memory =
+          std::make_shared<memory>(pool_pd->dst_primitive_desc(), output_data);
 
-      auto pool_dst_memory_p = std::make_shared<memory>(
-          memory::primitive_desc{dst_md, mkldnn_engine},
-          static_cast<void*>(output_data));
-      dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p);
+      dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
+      dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
+
+      pool_p = std::make_shared<pooling_forward>(*pool_pd, *(src_memory.get()),
+                                                 *(dst_memory.get()),
+                                                 *workspace_memory);
 
-      pool_p = std::make_shared<pooling_forward>(
-          *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()),
-          *workspace_memory);
       dev_ctx.SetBlob(key_pool_p, pool_p);
+
+      output_format =
+          (memory::format)dst_memory->get_primitive_desc().desc().data.format;
     } else {
       // Primitives already exist
       auto pool_src_memory_p =
@@ -137,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
       PADDLE_ENFORCE(pool_dst_memory_p != nullptr,
                      "Fail to find pooling dst mem_p in device context");
-      pool_src_memory_p->set_data_handle(
-          reinterpret_cast<void*>(const_cast<T*>(input_data)));
+      pool_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
       pool_dst_memory_p->set_data_handle(output_data);
+
+      output_format = (memory::format)pool_dst_memory_p->get_primitive_desc()
+                          .desc()
+                          .data.format;
     }
 
     // push primitive to stream and wait until it's executed
     std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(output_format);
   }
 
  private:
@@ -191,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
+    PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN &&
+                       in_x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input X tensor");
+    PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN &&
+                       out_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input output_grad tensor");
+
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
@@ -209,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     const T* out_grad_data = out_grad->data<T>();
     T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
+    memory::format in_x_grad_format{memory::format::format_undef};
 
     std::vector<int> diff_src_tz =
         paddle::framework::vectorize2int(in_x_grad->dims());
@@ -222,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key_pool_bwd_p = key + "@pool_bwd_p";
     const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
     const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
+    const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
+    const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
     const std::string key_pool_pd = key + "@pool_pd";
     const std::string key_pool_workspace_memory =
         key + "@pool_workspace_memory";
 
+    auto user_diff_dst_memory =
+        memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()},
+                mkldnn_engine},
+               to_void_cast<T>(out_grad_data));
+
+    std::shared_ptr<memory> diff_src_memory;
+    std::shared_ptr<memory> diff_dst_memory;
+    auto dst_memory =
+        std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
+    PADDLE_ENFORCE(dst_memory != nullptr,
+                   "Fail to find dst_memory in device context");
+
+    primitive reorder_diff_dst;
+    bool is_diff_dst_reordered = false;
     auto pool_bwd_p = std::static_pointer_cast<pooling_backward>(
         dev_ctx.GetBlob(key_pool_bwd_p));
     if (pool_bwd_p == nullptr) {
-      auto diff_src_md =
-          platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
-                                  mkldnn::memory::format::nchw);
-      auto diff_dst_md =
-          platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType<T>(),
-                                  mkldnn::memory::format::nchw);
+      // Retrieve src_memory/dst_memory saved in forward pass
+      auto src_memory =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
+      PADDLE_ENFORCE(src_memory != nullptr,
+                     "Fail to find src_memory in device context");
       // Retrieve pool_pd/pool_workspace_memory from device context
       auto pool_pd =
           std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
               dev_ctx.GetBlob(key_pool_pd));
       PADDLE_ENFORCE(pool_pd != nullptr,
                      "Fail to find pool_pd in device context");
-
-      auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
+      auto workspace_memory = std::static_pointer_cast<memory>(
           dev_ctx.GetBlob(key_pool_workspace_memory));
       PADDLE_ENFORCE(workspace_memory != nullptr,
                      "Fail to find workspace_memory in device context");
 
-      auto pool_diff_src_memory_p = std::make_shared<memory>(memory(
-          {diff_src_md, mkldnn_engine}, static_cast<void*>(in_x_grad_data)));
-      dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p);
-
-      auto pool_diff_dst_memory_p = std::make_shared<memory>(
-          memory({diff_dst_md, mkldnn_engine},
-                 static_cast<void*>(const_cast<T*>(out_grad_data))));
-      dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p);
+      // create memory descriptors for pooling
+      auto diff_src_md = src_memory.get()->get_primitive_desc().desc();
+      auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc();
 
       auto pool_bwd_desc = mkldnn::pooling_backward::desc(
           pooling_type == "max" ? mkldnn::algorithm::pooling_max
@@ -264,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
           pool_bwd_desc, mkldnn_engine, *pool_pd);
 
+      // reorder between user_diff_dst and pool diff_dst if needed
+      diff_dst_memory = std::make_shared<memory>(user_diff_dst_memory);
+      if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory =
+            std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
+        reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
+        is_diff_dst_reordered = true;
+      }
+
+      diff_src_memory = std::make_shared<memory>(
+          pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data);
+
+      dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory);
+      dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
+
       pool_bwd_p = std::make_shared<pooling_backward>(
-          pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory,
-          *(pool_diff_src_memory_p));
+          pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory,
+          *(diff_src_memory));
       dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
+
     } else {
       // Primitives already exist
-      auto pool_diff_src_memory_p = std::static_pointer_cast<memory>(
+      diff_src_memory = std::static_pointer_cast<memory>(
           dev_ctx.GetBlob(key_pool_diff_src_mem_p));
-      PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr,
+      PADDLE_ENFORCE(diff_src_memory != nullptr,
                      "Fail to find pooling src mem_p in device context");
-      auto pool_diff_dst_memory_p = std::static_pointer_cast<memory>(
+      diff_dst_memory = std::static_pointer_cast<memory>(
           dev_ctx.GetBlob(key_pool_diff_dst_mem_p));
-      PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr,
+      PADDLE_ENFORCE(diff_dst_memory != nullptr,
                      "Fail to find pooling dst mem_p in device context");
-      pool_diff_src_memory_p->set_data_handle(
-          reinterpret_cast<void*>(in_x_grad_data));
-      pool_diff_dst_memory_p->set_data_handle(const_cast<T*>(out_grad_data));
+
+      diff_src_memory->set_data_handle(reinterpret_cast<void*>(in_x_grad_data));
+      diff_dst_memory->set_data_handle(const_cast<T*>(out_grad_data));
+
+      // reorder between user_diff_dst and pool diff_dst if needed
+      if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory =
+            std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
+        reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
+        is_diff_dst_reordered = true;
+      }
     }
 
+    in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc()
+                           .desc()
+                           .data.format;
+
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{*(pool_bwd_p.get())};
+    std::vector<mkldnn::primitive> pipeline;
+    if (is_diff_dst_reordered) {
+      pipeline.push_back(reorder_diff_dst);
+    }
+    pipeline.push_back(*(pool_bwd_p.get()));
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    in_x_grad->set_layout(DataLayout::kMKLDNN);
+    in_x_grad->set_format(in_x_grad_format);
   }  // Compute()
 };
 
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
+
 REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::PoolMKLDNNOpKernel<float>);
+                   ops::PoolMKLDNNOpKernel<float>);
 REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::PoolMKLDNNGradOpKernel<float>);
+                   ops::PoolMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index f4fb2b132f..6707cdded4 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOp::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
       layout_, library_);
@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
@@ -129,8 +135,6 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                       "float16 can only be used when CUDNN is used");
   }
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                  library_);
 }
@@ -147,7 +151,8 @@ void Pool2dOpMaker::Make() {
             "The format of output tensor is also NCHW, "
             "where N is batch size, C is the number of channels, "
             "H is the height of the feature, "
-            "and W is the width of the feature.");
+            "and W is the width of the feature.")
+      .Reuse("X");
 
   AddAttr<std::string>("pooling_type",
                        "(string), pooling type, can be \"max\" for max-pooling "
@@ -240,7 +245,8 @@ void Pool3dOpMaker::Make() {
             "The format of output tensor is also NCDHW, "
             "where N is batch size, C is "
             "the number of channels, and D, H and W is the depth, height and "
-            "width of the feature, respectively.");
+            "width of the feature, respectively.")
+      .Reuse("X");
 
   AddAttr<std::string>("pooling_type",
                        "(string) Pooling type, can be \"max\" for max-pooling "
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index 4cfea958e8..f71ba84b31 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/operators/send_recv_util.h"
 
 namespace paddle {
@@ -41,24 +41,19 @@ class PrefetchOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
                 << outs[i] << " back";
-        rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
-                                          outs[i]);
+        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
-    PADDLE_ENFORCE(rpc_client->Wait());
+    rpc_client->Wait();
   }
 };
 
@@ -66,9 +61,6 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
     AddInput("X", "(LoDTensor) Input Id variables to be sent").AsDuplicable();
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which will be"
-              "initialized at most once.");
     AddOutput("Out",
               "(LoDTensor) result "
               "to be fetched from parameter server")
@@ -87,17 +79,6 @@ the parameter server and fetch result back.
   }
 };
 
-class PrefetchOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
-  }
-};
-
 class PrefetchOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext* ctx) const override {}
@@ -110,5 +91,4 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(prefetch, ops::PrefetchOp,
                   paddle::framework::EmptyGradOpMaker, ops::PrefetchOpMaker,
-                  ops::PrefetchOpVarTypeInference,
                   ops::PrefetchOpShapeInference);
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
new file mode 100644
index 0000000000..528a6e4a1b
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/random_crop_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RandomCropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "A batch of instances to random crop.");
+    AddInput("Seed", "The random seed.");
+    AddOutput("Out", "The cropped instance batch.");
+    AddOutput("SeedOut", "The random seed after random cropping.")
+        .AsIntermediate();
+    AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
+    AddComment(R"DOC(
+      This operator takes a batch of instance, and do random cropping on each instance.
+      It means that cropping positions differs on each instance, which is determined
+      by an uniform random generator. All cropped instances have the same shape, which 
+      is determined by the operator's attribute 'shape'.
+    )DOC");
+  }
+};
+
+class RandomCropOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    auto seed_dim = ctx->GetInputDim("Seed");
+    PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
+    auto out_dim = framework::vectorize2int(x_dim);
+    for (size_t i = 1; i <= shape.size(); ++i) {
+      size_t x_i = x_dim.size() - i;
+      size_t shape_i = shape.size() - i;
+      PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]);
+      out_dim[x_i] = shape[shape_i];
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
+    ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace f = paddle::framework;
+REGISTER_OPERATOR(random_crop, ops::RandomCropOp, ops::RandomCropOpMaker,
+                  ops::RandomCropOpInferShape, f::EmptyGradOpMaker);
+
+template <typename T>
+using Kernel = ops::RandomCropKernel<paddle::platform::CPUDeviceContext, T>;
+REGISTER_OP_CPU_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
+                       Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu
new file mode 100644
index 0000000000..6fc9bedc55
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/random_crop_op.h"
+
+namespace ops = paddle::operators;
+template <typename T>
+using Kernel = ops::RandomCropKernel<paddle::platform::CUDADeviceContext, T>;
+REGISTER_OP_CUDA_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
+                        Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
new file mode 100644
index 0000000000..f3261cbdc9
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -0,0 +1,181 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/for_range.h"
+#ifdef PADDLE_WITH_CUDA
+#include <thrust/random.h>
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext>
+struct Random;
+
+template <>
+struct Random<platform::CPUDeviceContext> {
+  using Engine = std::minstd_rand;
+
+  template <typename T>
+  using UniformIntDist = std::uniform_int_distribution<T>;
+};
+
+#ifdef PADDLE_WITH_CUDA
+template <>
+struct Random<platform::CUDADeviceContext> {
+  using Engine = thrust::minstd_rand;
+
+  template <typename T>
+  using UniformIntDist = thrust::uniform_int_distribution<T>;
+};
+#endif
+
+template <typename T>
+HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
+                                     const size_t* out_dims, int i, int rank,
+                                     size_t prod_x_remain,
+                                     size_t prod_out_remain,
+                                     const size_t* offsets) {
+  size_t x_dim_i = x_dims[i];
+  size_t out_dim_i = out_dims[i];
+  size_t x_stride = prod_x_remain / x_dim_i;
+  size_t out_stride = prod_out_remain / out_dim_i;
+  size_t offset_i = offsets[i];
+
+  if (i == rank - 1) {
+    PADDLE_ASSERT(x_stride == 1 && out_stride == 1);
+    x += offset_i;
+    for (size_t j = 0; j < out_dim_i; ++j) {
+      *out++ = *x++;
+    }
+  } else {
+    x += offset_i * x_stride;
+    for (size_t j = 0; j < out_dim_i; ++j) {
+      StridedMemcpy<T>(x, x_dims, out, out_dims, i + 1, rank, x_stride,
+                       out_stride, offsets);
+      x += x_stride;
+      out += out_stride;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+struct RandomCropFunctor {
+  const T* x_;
+  T* out_;
+  size_t x_dims_[9];
+  size_t out_dims_[9];
+  int num_batchsize_dims_;
+  int rank_;
+  int64_t seed_;
+
+  size_t prod_batchsize_dims_;
+  size_t prod_x_ins_dims_;
+  size_t prod_out_ins_dims_;
+
+  RandomCropFunctor(const T* x, T* out, const framework::DDim& x_dims,
+                    const framework::DDim& out_dims, int num_batchsize_dims,
+                    int64_t seed)
+      : x_(x),
+        out_(out),
+        num_batchsize_dims_(num_batchsize_dims),
+        rank_(x_dims.size()),
+        seed_(seed) {
+    PADDLE_ENFORCE_EQ(x_dims.size(), out_dims.size());
+    PADDLE_ENFORCE_GT(rank_, num_batchsize_dims_);
+    prod_batchsize_dims_ = 1;
+    prod_x_ins_dims_ = 1;
+    prod_out_ins_dims_ = 1;
+    for (size_t i = 0; i < static_cast<size_t>(rank_); ++i) {
+      size_t x_dim_i = x_dims[i];
+      size_t out_dim_i = out_dims[i];
+      x_dims_[i] = x_dim_i;
+      out_dims_[i] = out_dim_i;
+      if (i < static_cast<size_t>(num_batchsize_dims_)) {
+        PADDLE_ENFORCE_EQ(x_dim_i, out_dim_i);
+        prod_batchsize_dims_ *= x_dim_i;
+      } else {
+        prod_x_ins_dims_ *= x_dim_i;
+        prod_out_ins_dims_ *= out_dim_i;
+      }
+    }
+  }
+
+  HOSTDEVICE void operator()(size_t ins_idx) {
+    typename Random<DeviceContext>::Engine engine(seed_);
+    engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
+    size_t offsets[9];
+    for (int i = num_batchsize_dims_; i < rank_; ++i) {
+      typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
+          0, x_dims_[i] - out_dims_[i]);
+      offsets[i - num_batchsize_dims_] = dist(engine);
+    }
+
+    const T* x = x_ + ins_idx * prod_x_ins_dims_;
+    T* out = out_ + ins_idx * prod_out_ins_dims_;
+
+    StridedMemcpy<T>(x, x_dims_ + num_batchsize_dims_, out,
+                     out_dims_ + num_batchsize_dims_, 0,
+                     rank_ - num_batchsize_dims_, prod_x_ins_dims_,
+                     prod_out_ins_dims_, offsets);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class RandomCropKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
+    int64_t seed = 0;
+    if (platform::is_cpu_place(seed_tensor.place())) {
+      seed = *seed_tensor.data<int64_t>();
+    } else {
+      LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
+                      "your program";
+      framework::LoDTensor cpu_seed;
+      framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
+      seed = *cpu_seed.data<int64_t>();
+    }
+    auto shape = ctx.Attr<std::vector<int>>("shape");
+    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
+    auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
+
+    int num_batchsize_dims = x.dims().size() - shape.size();
+    RandomCropFunctor<DeviceContext, T> functor(
+        x.data<T>(), out.mutable_data<T>(ctx.GetPlace()), x.dims(), out.dims(),
+        num_batchsize_dims, seed);
+    platform::ForRange<DeviceContext> for_range(
+        ctx.template device_context<DeviceContext>(),
+        functor.prod_batchsize_dims_);
+
+    for_range(functor);
+
+    Random<platform::CPUDeviceContext>::Engine engine(seed);
+    engine.discard(functor.prod_batchsize_dims_ *
+                   (functor.rank_ - functor.num_batchsize_dims_));
+    *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
+        platform::CPUPlace()) = engine();
+  }
+};
+
+// TODO(fengjiayi): Backward of random crop op
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 3106978eb0..62532036f8 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -23,6 +23,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
+reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
 
 cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
 # Export local libraries to parent
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index 4cc7cbc6e8..ecbae3894d 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -20,7 +20,7 @@ namespace reader {
 
 class BatchReader : public framework::DecoratedReader {
  public:
-  BatchReader(ReaderBase* reader, int batch_size)
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
       : DecoratedReader(reader), batch_size_(batch_size) {
     buffer_.reserve(batch_size_);
   }
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
new file mode 100644
index 0000000000..0a02fcdeaa
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -0,0 +1,188 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class CustomReader : public framework::DecoratedReader {
+ public:
+  CustomReader(const std::shared_ptr<ReaderBase>& reader,
+               const framework::BlockDesc& sub_block,
+               const std::vector<std::string>& source_var_names,
+               const std::vector<std::string>& sink_var_names)
+      : DecoratedReader(reader),
+        program_(*sub_block.Program()),
+        sub_block_id_(sub_block.ID()),
+        exe_(framework::Executor(platform::CPUPlace())),
+        source_var_names_(source_var_names),
+        sink_var_names_(sink_var_names) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  const framework::ProgramDesc program_;
+  int sub_block_id_;
+  framework::Executor exe_;
+
+  std::vector<std::string> source_var_names_;
+  std::vector<std::string> sink_var_names_;
+};
+
+class CreateCustomReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    auto* sub_block = Attr<framework::BlockDesc*>("sub_block");
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    out->Reset(
+        new CustomReader(underlying_reader.Get(), *sub_block,
+                         Attr<std::vector<std::string>>("source_var_names"),
+                         Attr<std::vector<std::string>>("sink_var_names")));
+  }
+};
+
+class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<framework::BlockDesc*>(
+        "sub_block", "The block to hold all preprocessing operators.");
+    AddAttr<std::vector<std::string>>(
+        "source_var_names",
+        "Source variables are starting points of data preprocessing. They hold "
+        "preprocessing's input tensors. Each source variable corresponds to "
+        "one of underlying reader's output datas.");
+    AddAttr<std::vector<std::string>>(
+        "sink_var_names",
+        "Sink variables are ending points of data preprocessing. They hold "
+        "preprocessing's output tensors. Each sink variable corresponds to "
+        "one of custom reader's output datas.");
+    AddComment(R"DOC(
+      CreateCustomReader Operator
+
+      A custom reader can be used for input data preprocessing. 
+      A custom reader holds its own sub-block, which will be executed in CPU 
+      in its 'ReadNext()' function. Users can configurate their own 
+      preprocessing pipelines by inserting operators into custom reader's 
+      sub-block.
+    )DOC");
+  }
+};
+
+class CustomReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(!ctx->IsRuntime(),
+                   "'CustomReaderInferShape' should only be invoked during "
+                   "compile time.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output decorated reader should not be null.");
+    const auto* sub_block =
+        ctx->Attrs().Get<framework::BlockDesc*>("sub_block");
+    const auto sink_var_names =
+        ctx->Attrs().Get<std::vector<std::string>>("sink_var_names");
+    std::vector<std::vector<int64_t>> res_dims;
+    std::vector<int32_t> res_lod_levels;
+    for (const std::string& var_name : sink_var_names) {
+      auto* sink_var = sub_block->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(sink_var);
+      res_dims.emplace_back(sink_var->GetShape());
+      res_lod_levels.push_back(sink_var->GetLoDLevel());
+    }
+    auto* out_reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    out_reader->SetShapes(res_dims);
+    out_reader->SetLoDLevels(res_lod_levels);
+  }
+};
+
+class CustomReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
+    PADDLE_ENFORCE_NOT_NULL(out_reader);
+    out_reader->SetType(framework::proto::VarType::READER);
+
+    auto sink_var_names =
+        boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
+    const auto* sub_block =
+        boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
+    std::vector<framework::proto::VarType::Type> res_data_types;
+    for (const std::string& var_name : sink_var_names) {
+      framework::VarDesc* var = sub_block->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(var);
+      res_data_types.emplace_back(var->GetDataType());
+    }
+    out_reader->SetDataTypes(res_data_types);
+  }
+};
+
+void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  out->clear();
+  std::vector<framework::LoDTensor> underlying_outs;
+  reader_->ReadNext(&underlying_outs);
+  if (underlying_outs.empty()) {
+    // There is not next data.
+    return;
+  }
+  PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(),
+                 "The size of source_var_names(%d) and the size of "
+                 "underlying_outs(%d) are not consistent. Each feeding element "
+                 "must have its own source variable.",
+                 source_var_names_.size(), underlying_outs.size());
+  // The scope for CustomReader's sub-block should be independent and shouldn't
+  // be any other computation scope's child. Otherwise, data preprocessing and
+  // compution cannot be concurrent.
+  framework::Scope scope;
+  // 1. Copy LoDTensors from underlying reader's output to source variables.
+  for (size_t i = 0; i < source_var_names_.size(); ++i) {
+    framework::Variable* var = scope.Var(source_var_names_[i]);
+    framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
+    tensor->ShareDataWith(underlying_outs[i]);
+    tensor->set_lod(underlying_outs[i].lod());
+  }
+  // 2. Run the sub-block.
+  exe_.Run(program_, &scope, sub_block_id_, false, true);
+  // 3. Copy LoDTensors from sink variables to out.
+  out->resize(sink_var_names_.size());
+  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
+    const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
+                             .Get<framework::LoDTensor>();
+    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
+  }
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_OPERATOR(create_custom_reader, ops::CreateCustomReaderOp,
+                  ops::CreateCustomReaderOpMaker, ops::CustomReaderInferShape,
+                  ops::CustomReaderInferVarType,
+                  paddle::framework::EmptyGradOpMaker)
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index bc830a2b72..5f35b9b3ea 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
 class DoubleBufferReader : public framework::DecoratedReader {
  public:
   explicit DoubleBufferReader(
-      ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
+      const std::shared_ptr<ReaderBase>& reader,
+      platform::Place target_place = platform::CPUPlace())
       : DecoratedReader(reader), place_(target_place) {
     cpu_tensor_cache_.resize(kCacheSize);
     gpu_tensor_cache_.resize(kCacheSize);
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
index 249b0b7c6d..19b54110b9 100644
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -21,7 +21,7 @@ namespace reader {
 
 class MultiPassReader : public framework::DecoratedReader {
  public:
-  MultiPassReader(ReaderBase* reader, int pass_num)
+  MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
       : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
index 55bb9739e0..5b7e8a063a 100644
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -21,14 +21,15 @@ namespace reader {
 template <typename T>
 class RandomDataGenerator : public framework::ReaderBase {
  public:
-  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
-                      float max)
-      : framework::ReaderBase(), min_(min), max_(max), shapes_(shapes) {
-    PADDLE_ENFORCE_LE(
-        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float low,
+                      float high)
+      : framework::ReaderBase(), low_(low), high_(high), shapes_(shapes) {
+    PADDLE_ENFORCE_LE(low, high,
+                      "'low' shouldn't be greater than 'high'.(%f vs %f)", low,
+                      high);
     unsigned int seed = std::random_device()();
     engine_.seed(seed);
-    dist_ = std::uniform_real_distribution<float>(min_, max_);
+    dist_ = std::uniform_real_distribution<float>(low_, high_);
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
@@ -53,8 +54,8 @@ class RandomDataGenerator : public framework::ReaderBase {
   void ReInit() override { return; }
 
  private:
-  float min_;
-  float max_;
+  float low_;
+  float high_;
   std::minstd_rand engine_;
   std::uniform_real_distribution<float> dist_;
   std::vector<framework::DDim> shapes_;
@@ -78,22 +79,22 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
     std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
-                                          Attr<float>("max")));
+    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("low"),
+                                          Attr<float>("high")));
   }
 };
 
 class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
  protected:
   void Apply() override {
-    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
-    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+    AddAttr<float>("low", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("high", "The upper bound of reader's uniform distribution.");
     AddComment(R"DOC(
       CreateRandomDataGenerator Operator
 
       This Op creates a random reader.
       The reader generates random data instead of really reading from files.
-      Generated data follow an uniform distribution between 'min' and 'max'.
+      Generated data follow an uniform distribution between 'low' and 'high'.
     )DOC");
   }
 };
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index fd233be945..57e8e21214 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -23,7 +23,8 @@ namespace reader {
 
 class ShuffleReader : public framework::DecoratedReader {
  public:
-  ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0)
+  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
+                size_t seed = 0)
       : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
     VLOG(10) << "Create shuffle reader of " << reader_;
     if (seed_ == 0) {
diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
index 1db70f3e96..3798015146 100644
--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -21,7 +21,8 @@ namespace reader {
 
 class ThreadedReader : public framework::DecoratedReader {
  public:
-  explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {}
+  explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
+      : DecoratedReader(reader) {}
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
     std::lock_guard<std::mutex> lock(mutex_);
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
index 8c0dac65dd..31e5d81e55 100644
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -26,7 +26,11 @@ class MultiFileReader : public framework::ReaderBase {
   MultiFileReader(const std::vector<std::string>& file_names,
                   const std::vector<framework::DDim>& dims, size_t thread_num,
                   size_t buffer_size)
-      : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) {
+      : buffer_size_(buffer_size) {
+    readers_.reserve(file_names.size());
+    for (const std::string& f_name : file_names) {
+      readers_.emplace_back(CreateReaderByFileName(f_name, dims));
+    }
     prefetchers_.resize(thread_num);
     StartNewScheduler();
   }
@@ -40,14 +44,13 @@ class MultiFileReader : public framework::ReaderBase {
   void StartNewScheduler();
   void EndScheduler();
   void ScheduleThreadFunc();
-  void PrefetchThreadFunc(std::string file_name, size_t thread_idx);
+  void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
 
-  std::vector<std::string> file_names_;
-  std::vector<framework::DDim> dims_;
+  std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
   std::thread scheduler_;
   std::vector<std::thread> prefetchers_;
   size_t buffer_size_;
-  reader::BlockingQueue<size_t>* waiting_file_idx_;
+  reader::BlockingQueue<size_t>* waiting_reader_idx_;
   reader::BlockingQueue<size_t>* available_thread_idx_;
   reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
 };
@@ -65,15 +68,15 @@ void MultiFileReader::ReInit() {
 
 void MultiFileReader::StartNewScheduler() {
   size_t thread_num = prefetchers_.size();
-  waiting_file_idx_ = new reader::BlockingQueue<size_t>(file_names_.size());
+  waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
   available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
   buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
       buffer_size_);
 
-  for (size_t i = 0; i < file_names_.size(); ++i) {
-    waiting_file_idx_->Send(i);
+  for (size_t i = 0; i < readers_.size(); ++i) {
+    waiting_reader_idx_->Send(i);
   }
-  waiting_file_idx_->Close();
+  waiting_reader_idx_->Close();
   for (size_t i = 0; i < thread_num; ++i) {
     available_thread_idx_->Send(i);
   }
@@ -84,13 +87,13 @@ void MultiFileReader::StartNewScheduler() {
 void MultiFileReader::EndScheduler() {
   available_thread_idx_->Close();
   buffer_->Close();
-  waiting_file_idx_->Close();
+  waiting_reader_idx_->Close();
   if (scheduler_.joinable()) {
     scheduler_.join();
   }
   delete buffer_;
   delete available_thread_idx_;
-  delete waiting_file_idx_;
+  delete waiting_reader_idx_;
 }
 
 void MultiFileReader::ScheduleThreadFunc() {
@@ -102,12 +105,11 @@ void MultiFileReader::ScheduleThreadFunc() {
     if (prefetcher.joinable()) {
       prefetcher.join();
     }
-    size_t file_idx;
-    if (waiting_file_idx_->Receive(&file_idx)) {
+    size_t reader_idx;
+    if (waiting_reader_idx_->Receive(&reader_idx)) {
       // Still have files to read. Start a new prefetch thread.
-      std::string file_name = file_names_[file_idx];
-      prefetcher = std::thread([this, file_name, thread_idx] {
-        PrefetchThreadFunc(file_name, thread_idx);
+      prefetcher = std::thread([this, reader_idx, thread_idx] {
+        PrefetchThreadFunc(reader_idx, thread_idx);
       });
     } else {
       // No more file to read.
@@ -129,23 +131,22 @@ void MultiFileReader::ScheduleThreadFunc() {
   VLOG(5) << "MultiFileReader schedule thread terminates.";
 }
 
-void MultiFileReader::PrefetchThreadFunc(std::string file_name,
-                                         size_t thread_idx) {
-  VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
-  std::unique_ptr<framework::ReaderBase> reader =
-      CreateReaderByFileName(file_name, dims_);
+void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
+  VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
+  std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
   while (true) {
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
     if (ins.empty()) {
+      reader->ReInit();
       break;
     }
     try {
       buffer_->Send(std::move(ins));
     } catch (paddle::platform::EnforceNotMet e) {
       VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
-                 "thread of file '"
-              << file_name << "' will terminate.";
+                 "thread of file idx '"
+              << reader_idx << "' will terminate.";
       break;
     }
   }
@@ -154,7 +155,8 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name,
     VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
                "Fail to send thread_idx.";
   }
-  VLOG(5) << "The prefetch thread of file '" << file_name << "' terminates.";
+  VLOG(5) << "The prefetch thread of file idx '" << reader_idx
+          << "' terminates.";
 }
 
 class OpenFilesOp : public framework::OperatorBase {
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 11f1ddebc4..612e1f5eca 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -115,6 +115,7 @@ void DecoratedReaderInferShape::operator()(
       boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
   out_reader->SetLoDLevels(in_reader->GetLoDLevels());
 }
+
 void DecoratedReaderInferVarType::operator()(
     const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
   std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 7148bd0e36..15dfb5469b 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -36,19 +36,24 @@ class RecvOp : public framework::OperatorBase {
                const platform::Place& place) const override {
     auto outs = Outputs("Out");
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    int sync_mode = Attr<int>("sync_mode");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i];
-      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
+    }
+    if (sync_mode) {
+      rpc_client->Wait();
     }
-    PADDLE_ENFORCE(client_.Wait());
   }
-
- private:
-  mutable detail::RPCClient client_;
 };
 
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -65,12 +70,22 @@ This operator can get variables from server side.
                                       "Server endpoints in the order of input "
                                       "variables for mapping")
         .SetDefault({});
+    AddAttr<int>("sync_mode",
+                 "(int, default 0)"
+                 "sync recv or async recv.")
+        .SetDefault(0);
   }
 };
 
+class RecvOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
+REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker,
+                  ops::RecvOpMaker, ops::RecvOpShapeInference);
diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc
new file mode 100644
index 0000000000..95d3768e1f
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_max);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MaxFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu
new file mode 100644
index 0000000000..0d86b3127e
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_max,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MaxFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc
new file mode 100644
index 0000000000..fc258c2496
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_REDUCE_OP(reduce_mean);
+REGISTER_OP_CPU_KERNEL(reduce_mean,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::MeanFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu
new file mode 100644
index 0000000000..960cb3235b
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MeanFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h
new file mode 100644
index 0000000000..1359679c47
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MeanFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->mean(dim);
+  }
+};
+
+struct MeanGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h
new file mode 100644
index 0000000000..ec59f3e71c
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_max_op.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MaxFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
+  }
+};
+
+struct MinFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
+  }
+};
+
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc
new file mode 100644
index 0000000000..330a86d2e4
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_min);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MinFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu
new file mode 100644
index 0000000000..da466f805e
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_min,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MinFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
deleted file mode 100644
index eb8c21179d..0000000000
--- a/paddle/fluid/operators/reduce_op.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/reduce_op.h"
-
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class ReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReduceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReduceOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
-    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    if (reduce_all) {
-      if (keep_dim)
-        ctx->SetOutputDim(
-            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
-      else
-        ctx->SetOutputDim("Out", {1});
-    } else {
-      auto dims_vector = vectorize(x_dims);
-      if (keep_dim || x_rank == 1) {
-        dims_vector[dim] = 1;
-      } else {
-        dims_vector.erase(dims_vector.begin() + dim);
-      }
-      auto out_dims = framework::make_ddim(dims_vector);
-      ctx->SetOutputDim("Out", out_dims);
-      if (dim != 0) {
-        // Only pass LoD when not reducing on the first dim.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      }
-    }
-  }
-};
-
-class ReduceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-      ctx->ShareLoD("X", /*->*/ x_grad_name);
-    }
-  }
-};
-
-class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final {
-    AddInput("X",
-             "(Tensor) The input tensor. Tensors with rank at most 6 are "
-             "supported.");
-    AddOutput("Out", "(Tensor) The result tensor.");
-    AddAttr<int>(
-        "dim",
-        "(int, default 0) The dimension to reduce. "
-        "Must be in the range [-rank(input), rank(input)). "
-        "If `dim < 0`, the dim to reduce is `rank + dim`. "
-        "Note that reducing on the first dim will make the LoD info lost.")
-        .SetDefault(0);
-    AddAttr<bool>("keep_dim",
-                  "(bool, default false) "
-                  "If true, retain the reduced dimension with length 1.")
-        .SetDefault(false);
-    AddAttr<bool>("reduce_all",
-                  "(bool, default false) "
-                  "If true, output a scalar reduced along all dimensions.")
-        .SetDefault(false);
-    AddComment(string::Sprintf(R"DOC(
-%s Operator.
-
-This operator computes the %s of input tensor along the given dimension.
-The result tensor has 1 fewer dimension than the input unless keep_dim is true.
-If reduce_all is true, just reduce along all dimensions and output a scalar.
-
-)DOC",
-                               GetOpType(), GetName()));
-  }
-
- protected:
-  virtual std::string GetName() const = 0;
-  virtual std::string GetOpType() const = 0;
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_OP(op_name)                                        \
-  class __##op_name##Maker__ : public ops::ReduceOpMaker {                 \
-   protected:                                                              \
-    virtual std::string GetName() const { return #op_name; }               \
-    virtual std::string GetOpType() const { return "Reduce " #op_name; }   \
-  };                                                                       \
-  REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \
-                    paddle::framework::DefaultGradOpDescMaker<true>);      \
-  REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp)
-
-REGISTER_REDUCE_OP(sum);
-REGISTER_REDUCE_OP(mean);
-REGISTER_REDUCE_OP(max);
-REGISTER_REDUCE_OP(min);
-REGISTER_REDUCE_OP(prod);
-
-#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
-  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           float, ops::functor>,               \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           double, ops::functor>,              \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int, ops::functor>,                 \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int64_t, ops::functor>);            \
-  REGISTER_OP_CPU_KERNEL(                                                      \
-      reduce_type##_grad,                                                      \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu
deleted file mode 100644
index ae29587f55..0000000000
--- a/paddle/fluid/operators/reduce_op.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/reduce_op.h"
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)    \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
-                                     float, ops::functor>,                \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, double,      \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int,         \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t,     \
-                        ops::functor>);                                   \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type##_grad,                                                 \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,   \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,  \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,     \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index e42b4bfe42..72b6cf1773 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -14,111 +14,36 @@ limitations under the License. */
 
 #pragma once
 
-#include "glog/logging.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/reduce_op_function.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-struct SumFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->sum(dim);
-  }
-};
-
-struct SumGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
-  }
-};
-
-struct MeanFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->mean(dim);
-  }
-};
-
-struct MeanGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
-  }
-};
-
-struct MaxFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->maximum(dim);
-  }
-};
-
-struct MinFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->minimum(dim);
-  }
-};
-
-struct MaxOrMinGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    auto equals = (*x) == y->broadcast(dim);
-    auto ones = dx->constant(1);
-    auto zeros = dx->constant(0);
-    // If there are multiple minimum or maximum elements, the subgradient of
-    // each is the set [0, 1], and we pass gradient to all of them here.
-    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
-  }
-};
-
-struct ProdFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->prod(dim);
-  }
-};
-
-struct ProdGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+#define HANDLE_DIM(NDIM, RDIM)                                            \
+  if (ndim == NDIM && rdim == RDIM) {                                     \
+    ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>(                 \
+        context.template device_context<DeviceContext>(), *input, output, \
+        dims, keep_dim);                                                  \
   }
-};
 
 template <typename DeviceContext, typename T, typename Functor>
 class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
     if (reduce_all) {
       // Flatten and reduce 1-D tensor
-      auto* input = context.Input<Tensor>("X");
-      auto* output = context.Output<Tensor>("Out");
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input);
       auto out = EigenScalar<T>::From(*output);
       auto& place =
@@ -127,61 +52,25 @@ class ReduceKernel : public framework::OpKernel<T> {
       Functor functor;
       functor(place, &x, &out, reduce_dim);
     } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
-      switch (rank) {
-        case 1:
-          ReduceCompute<1>(context);
-          break;
-        case 2:
-          ReduceCompute<2>(context);
-          break;
-        case 3:
-          ReduceCompute<3>(context);
-          break;
-        case 4:
-          ReduceCompute<4>(context);
-          break;
-        case 5:
-          ReduceCompute<5>(context);
-          break;
-        case 6:
-          ReduceCompute<6>(context);
-          break;
-      }
-    }
-  }
-
- private:
-  template <size_t D>
-  void ReduceCompute(const framework::ExecutionContext& context) const {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-
-    auto x = EigenTensor<T, D>::From(*input);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    auto reduce_dim = Eigen::array<int, 1>({{dim}});
-    // construct the squeezed output tensor
-    bool keep_dim = context.Attr<bool>("keep_dim");
-    DDim dims = output->dims();
-    auto dims_vector = vectorize(dims);
-    if (keep_dim && x_rank > 1) {
-      dims_vector.erase(dims_vector.begin() + dim);
-      dims = framework::make_ddim(dims_vector);
-    }
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-
-    if (D == 1) {
-      auto out = EigenScalar<T>::From(*output);
-      functor(place, &x, &out, reduce_dim);
-    } else {
-      auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
-      functor(place, &x, &out, reduce_dim);
+      int ndim = input->dims().size();
+      int rdim = dims.size();
+      // comments for accelerating compiling temporarily.
+      //      HANDLE_DIM(6, 5);
+      //      HANDLE_DIM(6, 4);
+      //      HANDLE_DIM(6, 3);
+      //      HANDLE_DIM(6, 2);
+      //      HANDLE_DIM(6, 1);
+      //      HANDLE_DIM(5, 4);
+      //      HANDLE_DIM(5, 3);
+      //      HANDLE_DIM(5, 2);
+      //      HANDLE_DIM(5, 1);
+      HANDLE_DIM(4, 3);
+      HANDLE_DIM(4, 2);
+      HANDLE_DIM(4, 1);
+      HANDLE_DIM(3, 2);
+      HANDLE_DIM(3, 1);
+      HANDLE_DIM(2, 1);
+      HANDLE_DIM(1, 1);
     }
   }
 };
@@ -191,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto dims = context.Attr<std::vector<int>>("dim");
+
+    auto* input0 = context.Input<Tensor>("X");
+    auto* input1 = context.Input<Tensor>("Out");
+    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+    output->mutable_data<T>(context.GetPlace());
+
     if (reduce_all) {
-      auto* input0 = context.Input<Tensor>("X");
-      auto* input1 = context.Input<Tensor>("Out");
-      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input0);
       auto x_reduce = EigenVector<T>::From(*input1);
       auto x_reduce_grad = EigenVector<T>::From(*input2);
@@ -209,66 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> {
       functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
               broadcast_dim[0]);
     } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
+      int rank = input0->dims().size();
       switch (rank) {
         case 1:
-          ReduceGradCompute<1>(context);
+          ReduceGradFunctor<DeviceContext, T, 1, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 2:
-          ReduceGradCompute<2>(context);
+          ReduceGradFunctor<DeviceContext, T, 2, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 3:
-          ReduceGradCompute<3>(context);
+          ReduceGradFunctor<DeviceContext, T, 3, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 4:
-          ReduceGradCompute<4>(context);
+          ReduceGradFunctor<DeviceContext, T, 4, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 5:
-          ReduceGradCompute<5>(context);
+          ReduceGradFunctor<DeviceContext, T, 5, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 6:
-          ReduceGradCompute<6>(context);
+          ReduceGradFunctor<DeviceContext, T, 6, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
       }
     }
   }
+};
 
- private:
-  template <size_t D>
-  void ReduceGradCompute(const framework::ExecutionContext& context) const {
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Out");
-    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+class ReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    output->mutable_data<T>(context.GetPlace());
-    auto x = EigenTensor<T, D>::From(*input0);
-    auto x_grad = EigenTensor<T, D>::From(*output);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    DDim dims = input0->dims();
-    dims[dim] = 1;
-    auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
-    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReduceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReduceOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    if (reduce_all) {
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = vectorize(x_dims);
+      if (keep_dim) {
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = 1;
+        }
+      } else {
+        const int kDelFlag = -2;
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = kDelFlag;
+        }
+        dims_vector.erase(
+            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+            dims_vector.end());
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dims[0] != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+};
 
-    Eigen::array<int, D> broadcast_dim;
-    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
-    broadcast_dim[dim] = input0->dims()[dim];
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-    functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
-            broadcast_dim[dim]);
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
+  }
+};
+
+class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() final {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<std::vector<int>>(
+        "dim",
+        "(list<int>, default {0}) The dimensions to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault({0});
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+%s Operator.
+
+This operator computes the %s of input tensor along the given dimension.
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
+
+)DOC",
+                               GetOpType(), GetName()));
   }
+
+ protected:
+  virtual std::string GetName() const = 0;
+  virtual std::string GetOpType() const = 0;
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_KERNEL_FUNCTOR(__macro)                \
-  __macro(reduce_sum, SumFunctor, SumGradFunctor);      \
-  __macro(reduce_mean, MeanFunctor, MeanGradFunctor);   \
-  __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
-  __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \
-  __macro(reduce_prod, ProdFunctor, ProdGradFunctor);
+namespace ops = paddle::operators;
+
+#define REGISTER_REDUCE_OP(op_name)                                      \
+  class __##op_name##Maker__ : public ops::ReduceOpMaker {               \
+   protected:                                                            \
+    virtual std::string GetName() const { return #op_name; }             \
+    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
+  };                                                                     \
+  REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
+                    paddle::framework::DefaultGradOpDescMaker<true>);    \
+  REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h
new file mode 100644
index 0000000000..3da27bc8ac
--- /dev/null
+++ b/paddle/fluid/operators/reduce_op_function.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, size_t D, size_t R_D,
+          typename Functor>
+void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
+                   framework::Tensor* output, const std::vector<int>& dims,
+                   bool keep_dim) {
+  auto x = EigenTensor<T, D>::From(input);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto reduce_dim = Eigen::array<int, R_D>();
+  std::vector<int> dims_ref = dims;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
+    reduce_dim[i] = dims_ref[i];
+  }
+  // construct the squeezed output tensor
+  DDim out_dims = output->dims();
+  if (keep_dim && x_rank > 1) {
+    const int kDelFlag = -2;
+    auto dims_vector = framework::vectorize(out_dims);
+    for (size_t i = 0; i < dims_ref.size(); ++i) {
+      dims_vector[dims_ref[i]] = kDelFlag;
+    }
+    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+                      dims_vector.end());
+    out_dims = framework::make_ddim(dims_vector);
+  }
+  auto& place = *context.eigen_device();
+  Functor functor;
+
+  if (D == 1) {
+    auto out = EigenScalar<T>::From(*output);
+    functor(place, &x, &out, reduce_dim);
+  } else {
+    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+    functor(place, &x, &out, reduce_dim);
+  }
+}
+
+template <typename DeviceContext, typename T, size_t D, typename Functor>
+void ReduceGradFunctor(const DeviceContext& context,
+                       const framework::Tensor& input0,
+                       const framework::Tensor& input1,
+                       const framework::Tensor& input2,
+                       framework::Tensor* output,
+                       const std::vector<int>& dims) {
+  auto x = EigenTensor<T, D>::From(input0);
+  auto x_grad = EigenTensor<T, D>::From(*output);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto x_dims = input0.dims();
+  auto reduced_dims_v = framework::vectorize(x_dims);
+  std::vector<int> dims_ref = dims;
+  Eigen::array<int, D> broadcast_dim;
+  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+
+  int broad_cats_times = 1;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) {
+      dims_ref[i] = x_rank + dims_ref[i];
+    }
+    reduced_dims_v[dims_ref[i]] = 1;
+    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
+    broad_cats_times *= x_dims[dims_ref[i]];
+  }
+  auto reduced_dims = framework::make_ddim(reduced_dims_v);
+  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
+  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
+
+  auto& place = *context.eigen_device();
+
+  Functor functor;
+  functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
+          broad_cats_times);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc
new file mode 100644
index 0000000000..713728b997
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_REDUCE_OP(reduce_prod);
+REGISTER_OP_CPU_KERNEL(reduce_prod,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::ProdFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu
new file mode 100644
index 0000000000..d62e677d92
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h
new file mode 100644
index 0000000000..97748113e0
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct ProdFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->prod(dim);
+  }
+};
+
+struct ProdGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc
new file mode 100644
index 0000000000..c5b5398787
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_REDUCE_OP(reduce_sum);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::SumFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu
new file mode 100644
index 0000000000..f2e16955a5
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::SumFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
new file mode 100644
index 0000000000..e67d7e1da5
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct SumFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->sum(dim);
+  }
+};
+
+struct SumGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc
new file mode 100644
index 0000000000..a20f7d231f
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class ReverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    const auto& x_dims = ctx->GetInputDim("X");
+    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
+    PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
+    for (int a : axis) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The axis must be less than input tensor's rank.");
+    }
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+
+class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The LoDTensor to be flipped.");
+    AddOutput("Out", "The LoDTensor after flipping.");
+    AddAttr<std::vector<int>>(
+        "axis", "The axises that along which order of elements is reversed.");
+    AddComment(R"DOC(
+      Reverse Operator.
+
+      Reverse the order of elements in the input LoDTensor along given axises.
+
+      Case 1:
+        Given
+            X = [[1, 2, 3, 4, 5]
+                 [6, 7, 8, 9, 10]
+                 [11, 12, 13, 14, 15]],
+        and
+            axis = [0],
+        we get:
+            Out = [[11, 12, 13, 14, 15]
+                   [6, 7, 8, 9, 10]
+                   [1, 2, 3, 4, 5]].
+        
+      Case 2:
+        Given
+            X = [[[1, 2, 3, 4]
+                  [5, 6, 7, 8]]
+                 [[9, 10, 11, 12]
+                  [13, 14, 15, 16]]],
+        and
+            axis = [0, 2],
+        we get:
+            Out = [[[12, 11, 10, 9]
+                    [16, 15, 14, 13]]
+                   [[4, 3, 2, 1]
+                    [8, 7, 6, 5]]],
+    )DOC");
+  }
+};
+
+class ReverseGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("reverse");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("axis", GetAttr("axis"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
+                  ops::ReverseGradMaker);
+REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
+REGISTER_OP_CPU_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu
new file mode 100644
index 0000000000..635c41529b
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cu
@@ -0,0 +1,24 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h
new file mode 100644
index 0000000000..9063cd59bb
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T, int Rank>
+struct ReverseFunctor {
+  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
+                  framework::LoDTensor* out, const std::vector<int>& axis) {
+    Eigen::array<bool, Rank> reverse_axis;
+    for (int i = 0; i < Rank; ++i) {
+      reverse_axis[i] = false;
+    }
+    for (int a : axis) {
+      reverse_axis[a] = true;
+    }
+
+    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
+    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
+    auto* dev = context.eigen_device();
+
+    out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ReverseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::LoDTensor>("X");
+    auto* out = context.Output<framework::LoDTensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    const auto& axis = context.Attr<std::vector<int>>("axis");
+    int rank = x->dims().size();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    switch (rank) {
+      case 1:
+        ReverseFunctor<DeviceContext, T, 1> functor1;
+        functor1(dev_ctx, *x, out, axis);
+        break;
+      case 2:
+        ReverseFunctor<DeviceContext, T, 2> functor2;
+        functor2(dev_ctx, *x, out, axis);
+        break;
+      case 3:
+        ReverseFunctor<DeviceContext, T, 3> functor3;
+        functor3(dev_ctx, *x, out, axis);
+        break;
+      case 4:
+        ReverseFunctor<DeviceContext, T, 4> functor4;
+        functor4(dev_ctx, *x, out, axis);
+        break;
+      case 5:
+        ReverseFunctor<DeviceContext, T, 5> functor5;
+        functor5(dev_ctx, *x, out, axis);
+        break;
+      case 6:
+        ReverseFunctor<DeviceContext, T, 6> functor6;
+        functor6(dev_ctx, *x, out, axis);
+        break;
+      default:
+        PADDLE_THROW(
+            "Reserve operator doesn't supports tensors whose ranks are greater "
+            "than 6.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 1ce0907f3a..c6c975a23c 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -19,8 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/macros.h"
 
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -36,31 +37,33 @@ class SendBarrierOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+    bool sync_mode = Attr<bool>("sync_mode");
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
 
-    // need to wait before sending send_barrier message
-    PADDLE_ENFORCE(rpc_client->Wait());
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<RPCCLIENT_T>();
+
+    VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
 
-    for (auto& ep : eps) {
-      VLOG(3) << "send barrier, ep: " << ep;
-      rpc_client->AsyncSendBatchBarrier(ep);
+    // need to wait before sending send_barrier message
+    rpc_client->Wait();
+    if (sync_mode) {
+      for (auto& ep : eps) {
+        VLOG(3) << "send barrier, ep: " << ep;
+        rpc_client->AsyncSendBatchBarrier(ep);
+      }
+      rpc_client->Wait();
     }
-    PADDLE_ENFORCE(rpc_client->Wait());
   }
 };
 
 class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which is"
-              "initialized at most once.");
     AddComment(R"DOC(
 SendBarrier operator
 
@@ -72,17 +75,7 @@ the Parameter Server would knew all variables have been sent.
                                       "(string vector, default 127.0.0.1:6164)"
                                       "Server endpoints to send variables to.")
         .SetDefault({"127.0.0.1:6164"});
-  }
-};
-
-class SendBarrierOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+    AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
   }
 };
 
@@ -98,5 +91,4 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
                   paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
-                  ops::SendBarrierOpVarTypeInference,
                   ops::SendBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 95bb1f3c69..84ec366253 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -16,10 +16,9 @@ limitations under the License. */
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/operators/send_recv_util.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     auto ins = Inputs("X");
-    auto outs = Outputs("Out");
-    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    std::vector<std::string> endpoints =
-        Attr<std::vector<std::string>>("endpoints");
 
-    bool sync_mode = Attr<bool>("sync_mode");
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    int sync_send = Attr<int>("sync_mode");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
@@ -49,43 +45,21 @@ class SendOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        // TODO(Yancey1989): we need to use an IO threadpool which has
+        // a larger number of threads than the computing threadpool.
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
-    PADDLE_ENFORCE(rpc_client->Wait());
-
-    if (sync_mode) {
-      for (auto& ep : endpoints) {
-        VLOG(3) << "batch barrier, ep: " << ep;
-        rpc_client->AsyncSendBatchBarrier(ep);
-      }
-      PADDLE_ENFORCE(rpc_client->Wait());
-    }
-
-    if (outs.size() > 0) {
-      for (size_t i = 0; i < outs.size(); i++) {
-        VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
-        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
-      }
-      PADDLE_ENFORCE(rpc_client->Wait());
-      // tell pservers that current trainer have called fetch
-      for (auto& ep : endpoints) {
-        VLOG(2) << "send fetch barrier, ep: " << ep;
-        rpc_client->AsyncSendFetchBarrier(ep);
-      }
-      PADDLE_ENFORCE(rpc_client->Wait());
+    if (sync_send) {
+      rpc_client->Wait();
     }
   }
 };
@@ -93,40 +67,22 @@ class SendOp : public framework::OperatorBase {
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
-    AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
-    AddOutput("Out", "(Tensor) Output tensor to be received from server")
+    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
         .AsDuplicable();
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which is"
-              "initialized at most once.");
     AddComment(R"DOC(
 Send operator
 
-This operator will send tensor to recv_op at the parameter server.
+This operator will send variables to listen_and_serve op at the parameter server.
 )DOC");
-    // TODO(typhoonzero): remove this attr generate de-duplicated vector from
-    // epmap when initializing.
-    AddAttr<std::vector<std::string>>("endpoints",
-                                      "(string vector, default 127.0.0.1:6164)"
-                                      "Server endpoints to send variables to.")
-        .SetDefault({});
+    AddAttr<int>("sync_mode",
+                 "(int, default 0)"
+                 "sync send or async send.")
+        .SetDefault(0);
     AddAttr<std::vector<std::string>>("epmap",
                                       "(string vector, default 127.0.0.1:6164)"
                                       "Server endpoints in the order of input "
                                       "variables for mapping")
-        .SetDefault({});
-    AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
-  }
-};
-
-class SendOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+        .SetDefault({"127.0.0.1:6164"});
   }
 };
 
@@ -141,5 +97,4 @@ class SendOpShapeInference : public framework::InferShapeBase {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(send, ops::SendOp, paddle::framework::EmptyGradOpMaker,
-                  ops::SendOpMaker, ops::SendOpVarTypeInference,
-                  ops::SendOpShapeInference);
+                  ops::SendOpMaker, ops::SendOpShapeInference);
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index d5303eaf50..e550552b19 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -156,6 +156,7 @@ TEST(SendRecvOp, CPUDense) {
   std::thread server_thread(StartServerNet, false, &initialized);
   while (!initialized) {
   }
+
   static_cast<paddle::operators::ListenAndServOp *>(listen_and_serv_op.get())
       ->WaitServerReady();
 
@@ -175,9 +176,10 @@ TEST(SendRecvOp, CPUDense) {
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
-  auto send_op = f::OpRegistry::CreateOp(
-      "send", {{"X", {"x1"}}},
-      {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
+  const f::VariableNameMap &inputs = {{"X", {"x1"}}};
+  const f::VariableNameMap &outputs = {{"Out", {"Out"}}};
+
+  auto send_op = f::OpRegistry::CreateOp("send", inputs, outputs, attrs);
   send_op->Run(scope, place);
 
   auto in_var = scope.Var("x1");
@@ -220,9 +222,8 @@ TEST(SendRecvOp, CPUSparse) {
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
-  auto send_op = f::OpRegistry::CreateOp(
-      "send", {{"X", {"x1"}}},
-      {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
+  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
+                                         {{"Out", {"Out"}}}, attrs);
   send_op->Run(scope, place);
 
   auto x0 = scope.Var("x0")->GetMutable<f::SelectedRows>();
diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h
index 113513eb6b..deab005149 100644
--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
@@ -20,6 +20,9 @@ namespace operators {
 
 inline bool NeedSend(const framework::Scope& scope,
                      const std::string& varname) {
+  // dummy variable is only used in parallel executor to represent
+  // some dependency relationship, we don't need to send/recv it.
+  if (varname == "dummy") return false;
   auto* var = scope.FindVar(varname);
   PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
                           varname);
diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc
deleted file mode 100644
index f11e84c176..0000000000
--- a/paddle/fluid/operators/send_vars_op.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/grpc_client.h"
-#include "paddle/fluid/operators/send_recv_util.h"
-
-namespace paddle {
-namespace operators {
-
-class SendVarsOp : public framework::OperatorBase {
- public:
-  SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::VariableNameMap& outputs,
-             const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    auto ins = Inputs("X");
-
-    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    int sync_send = Attr<int>("sync_send");
-
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
-
-    for (size_t i = 0; i < ins.size(); i++) {
-      if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        // TODO(Yancey1989): we need to use an IO threadpool which has
-        // a larger number of threads than the computing threadpool.
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
-      } else {
-        VLOG(3) << "don't send no-initialied variable: " << ins[i];
-      }
-    }
-    if (sync_send) {
-      rpc_client->Wait();
-    }
-  }
-};
-
-class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
-        .AsDuplicable();
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which will be"
-              "initialized at most once.");
-    AddComment(R"DOC(
-Send operator
-
-This operator will send variables to listen_and_serve op at the parameter server.
-)DOC");
-    AddAttr<int>("sync_send",
-                 "(int, default 0)"
-                 "sync send or async send.")
-        .SetDefault(0);
-    AddAttr<std::vector<std::string>>("epmap",
-                                      "(string vector, default 127.0.0.1:6164)"
-                                      "Server endpoints in the order of input "
-                                      "variables for mapping")
-        .SetDefault({"127.0.0.1:6164"});
-  }
-};
-
-class SendVarsOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
-  }
-};
-
-class SendVarsOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
-                  paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
-                  ops::SendVarsOpVarTypeInference,
-                  ops::SendVarsOpShapeInference);
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index 7a2bdeac09..fef230e42d 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -74,7 +74,8 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
     AddOutput("ParamOut",
               "(Tensor or SelectedRows, same with Param) "
-              "Output parameter, should share the same memory with Param");
+              "Output parameter, should share the same memory with Param")
+        .Reuse("Param");
     AddComment(R"DOC(
 
 SGD operator
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index f3e88b0a0b..2685ce217e 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -96,8 +96,12 @@ class SGDOpKernel : public framework::OpKernel<T> {
         return;
       }
 
-      size_t param_row_width = param.value().numel() / param.rows().size();
-      size_t grad_row_width = grad.value().numel() / grad.rows().size();
+      auto param_row_width = param.value().dims()[1];
+      auto grad_row_width = grad.value().dims()[1];
+      VLOG(4) << " param rows: " << param.rows().size()
+              << " param memory rows: " << param.value().dims()[0]
+              << " grad rows: " << grad.rows().size()
+              << " grad memory rows: " << grad.value().dims()[0];
       PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
                         "param_row should have the same size with grad_row");
 
@@ -110,7 +114,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
         int64_t id_index = param.Index(grad.rows()[i]);
         PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
                           "id should be in the table");
-        for (size_t j = 0; j < grad_row_width; j++) {
+        for (int64_t j = 0; j < grad_row_width; j++) {
           out_data[id_index * grad_row_width + j] -=
               lr[0] * grad_data[i * grad_row_width + j];
         }
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
new file mode 100644
index 0000000000..c75fce7959
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/shape_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ShapeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input (Input) of get_shape op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Out) of get_shape op should not be null.");
+    auto in_dim = ctx->GetInputDim("Input");
+    ctx->SetOutputDim("Out", {in_dim.size()});
+  }
+};
+
+class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input", "(Tensor), The input tensor.");
+    AddOutput("Out", "(Tensor), The shape of input tensor.");
+    AddComment(R"DOC(
+Shape Operator. 
+Get the shape of input tensor.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>,
+                       ops::ShapeKernel<float>, ops::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
new file mode 100644
index 0000000000..7736a2a1e1
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/shape_op.h"
+
+REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
+                        paddle::operators::ShapeKernel<int64_t>,
+                        paddle::operators::ShapeKernel<float>,
+                        paddle::operators::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h
new file mode 100644
index 0000000000..3be86b66a5
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class ShapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_t = ctx.Input<Tensor>("Input");
+    auto* out_t = ctx.Output<Tensor>("Out");
+    auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace());
+    auto in_dims = in_t->dims();
+    for (int i = 0; i < in_dims.size(); ++i) {
+      out_data[i] = in_dims[i];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
new file mode 100644
index 0000000000..61bb445e8b
--- /dev/null
+++ b/paddle/fluid/operators/slice_op.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/slice_op.h"
+#include <algorithm>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class SliceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input (Input) of slice op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Out) of slice op should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(in_dims.size() < 7,
+                   "The rank of input should be less than 7.");
+    framework::DDim out_dims(in_dims);
+    auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
+    auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
+
+    PADDLE_ENFORCE_EQ(starts.size(), ends.size());
+    PADDLE_ENFORCE_EQ(starts.size(), axes.size());
+    int dim_value, start, end;
+    for (size_t i = 0; i < axes.size(); ++i) {
+      dim_value = out_dims[axes[i]];
+      start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
+      end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
+      start = std::max(start, 0);
+      end = std::max(end, 0);
+      start = std::min(start, dim_value);
+      end = std::min(end, dim_value);
+      start = std::min(start, end);
+      out_dims[axes[i]] = end - start;
+    }
+    ctx->SetOutputDim("Out", out_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.GetPlace());
+  }
+};
+
+class SliceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input", "Tensor of data to extract slices from.");
+    AddOutput("Out", "Sliced data tensor.");
+
+    AddAttr<std::vector<int>>(
+        "axes",
+        "(list<int>) Axes that `starts` and `ends` apply to. It's optional."
+        "If not present, will be treated as [0, 1, ..., len(`starts`) - 1].");
+    AddAttr<std::vector<int>>(
+        "starts",
+        "(list<int>) Starting indices of corresponding axis in `axes`");
+    AddAttr<std::vector<int>>(
+        "ends",
+        "(list<int>) Starting indices of corresponding axis in `axes`.");
+
+    AddComment(R"DOC(
+Slice Operator.
+
+Produces a slice of the input tensor along multiple axes. Similar to numpy:
+https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
+Slice uses `axes`, `starts` and `ends` attributes to specify the start and 
+end dimension for each axis in the list of axes, it uses this information
+to slice the input data tensor. If a negative value is passed for any of 
+the start or end indices, it represents number of elements before the end 
+of that dimension. If the value passed to start or end is larger than
+the n (the number of elements in this dimension), it represents n. 
+For slicing to the end of a dimension with unknown size, it is recommended 
+to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1].
+
+    Example 1:
+    Given:
+        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+        axes = [0, 1]
+        starts = [1, 0]
+        ends = [2, 3]
+    Then:
+        result = [ [5, 6, 7], ]
+
+    Example 2:
+    Given:
+        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+        starts = [0, 1]
+        ends = [-1, 1000]
+    Then:
+        result = [ [2, 3, 4], ]
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    slice, ops::SliceKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu
new file mode 100644
index 0000000000..8c1767c70b
--- /dev/null
+++ b/paddle/fluid/operators/slice_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/slice_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h
new file mode 100644
index 0000000000..ba231aee17
--- /dev/null
+++ b/paddle/fluid/operators/slice_op.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SliceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    int rank = ctx.Input<framework::Tensor>("Input")->dims().size();
+    switch (rank) {
+      case 1:
+        SliceCompute<1>(ctx);
+        break;
+      case 2:
+        SliceCompute<2>(ctx);
+        break;
+      case 3:
+        SliceCompute<3>(ctx);
+        break;
+      case 4:
+        SliceCompute<4>(ctx);
+        break;
+      case 5:
+        SliceCompute<5>(ctx);
+        break;
+      case 6:
+        SliceCompute<6>(ctx);
+        break;
+    }
+  }
+
+ private:
+  template <size_t D>
+  void SliceCompute(const framework::ExecutionContext& context) const {
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto in = context.Input<framework::Tensor>("Input");
+    auto out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    auto out_dims = out->dims();
+    auto in_dims = in->dims();
+    auto axes = context.Attr<std::vector<int>>("axes");
+    auto starts = context.Attr<std::vector<int>>("starts");
+
+    auto offsets = Eigen::array<int, D>();
+    auto extents = Eigen::array<int, D>();
+    for (size_t i = 0; i < D; ++i) {
+      offsets[i] = 0;
+      extents[i] = out_dims[i];
+    }
+    int start;
+    for (size_t i = 0; i < axes.size(); ++i) {
+      start = starts[i];
+      if (start < 0) {
+        start = (start + in_dims[axes[i]]);
+      }
+      start = std::max(start, 0);
+      offsets[axes[i]] = start;
+    }
+    auto in_t =
+        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in);
+    auto out_t =
+        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *out);
+    out_t.device(place) = in_t.slice(offsets, extents);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index cc256aa627..847b3cbd1b 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
@@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
 
@@ -68,9 +72,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
                      "float16 can only be used on GPU place");
     }
 
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                   framework::StringToDataLayout(data_format),
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                    library_);
   }
 };
@@ -81,7 +83,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X",
              "The input tensor of softmax. "
              "2-D with shape [batch_size, input_feature_dimensions].");
-    AddOutput("Out", "The normalized values with the same shape as X.");
+    AddOutput("Out", "The normalized values with the same shape as X.")
+        .Reuse("X");
     AddAttr<bool>(
         "use_cudnn",
         "(bool, default false) Only used in cudnn kernel, need install cudnn")
@@ -142,6 +145,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
+
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index bcc5e22d4a..863baba9ea 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -115,7 +115,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
         .AsDuplicable();
-    AddOutput("Out", "(Tensor) The output tensor of sum operator.");
+    AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
     AddComment(R"DOC(
 Sum operator.
 
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
new file mode 100644
index 0000000000..4b1208c437
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace operators {
+
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+
+  switch (shape.size()) {
+    case 2:
+      return nvinfer1::Dims2(shape[0], shape[1]);
+    case 3:
+      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
+    case 4:
+      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
+    default:
+      return nvinfer1::Dims();
+  }
+  return nvinfer1::Dims();
+}
+
+}  // namespace
+
+template <typename DeviceContext, typename T>
+void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
+    const framework::ExecutionContext &context) const {
+  VLOG(4) << "Prepare engine";
+  // Get the ProgramDesc and pass to convert.
+  framework::proto::BlockDesc block_desc;
+  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
+  max_batch_ = context.Attr<int>("max_batch");
+  auto max_workspace = context.Attr<int>("max_workspace");
+  engine_ = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch_, max_workspace, &stream_);
+  engine_->InitNetwork();
+
+  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  // Add inputs
+  VLOG(4) << "declare inputs";
+  for (auto &input : context.Inputs("Xs")) {
+    VLOG(4) << "declare input " << input;
+    auto *var = block.FindVar(input);
+    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                      "TensorRT engine only takes LoDTensor as input");
+    auto shape = var->GetShape();
+    engine_->DeclareInput(
+        input, FluidDataType2TRT(
+                   var->Proto()->type().lod_tensor().tensor().data_type()),
+        Vec2TRT_Dims(var->GetShape()));
+  }
+
+  // TODO(Superjomn) parameters should be passed after analysised from outside.
+  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
+      block_desc, {}, context.scope(), engine_);
+
+  // Add outputs
+  VLOG(4) << "declare outputs";
+  for (auto &output : context.Outputs("Ys")) {
+    VLOG(4) << "declare output " << output;
+    engine_->DeclareOutput(output);
+  }
+
+  engine_->FreezeNetwork();
+}
+
+class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Xs", "A list of inputs.").AsDuplicable();
+    AddOutput("Ys", "A list of outputs").AsDuplicable();
+    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<int>("max_batch", "the maximum batch size.");
+    AddAttr<int>("max_workspace", "the maximum batch size.");
+    AddComment("TensorRT engine operator.");
+  }
+};
+
+class TensorRTEngineInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
+                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    tensorrt_engine,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
+
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
new file mode 100644
index 0000000000..4b089601ff
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+
+namespace paddle {
+namespace operators {
+
+class TensorRTEngineOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input0 = ctx.Inputs("Xs").front();
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(ctx.scope()
+                                  .FindVar(input0)
+                                  ->GetMutable<framework::LoDTensor>()
+                                  ->type()),
+        platform::CPUPlace());
+    return kt;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TensorRTEngineKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    if (!engine_) {
+      Prepare(context);
+    }
+    auto input_names = context.op().Inputs("Xs");
+    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
+    // Try to determine a batch_size
+    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
+        context.scope(), input_names.front());
+    int batch_size = tensor0.dims()[0];
+    PADDLE_ENFORCE_LE(batch_size, max_batch_);
+
+    // Convert input tensor from fluid to engine.
+    for (const auto& x : context.Inputs("Xs")) {
+      // convert input and copy to TRT engine's buffer
+      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
+          context.scope(), x);
+      if (platform::is_cpu_place(t.place())) {
+        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+                                 t.memory_size());
+      } else {
+        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+                                 t.memory_size());
+      }
+    }
+    // Execute the engine.
+    PADDLE_ENFORCE_GT(batch_size, 0);
+    engine_->Execute(batch_size);
+    // Convert output tensor from engine to fluid
+    for (const auto& y : context.Outputs("Ys")) {
+      // convert output and copy to fluid.
+      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
+      auto dims = trt_t->getDimensions();
+      // Use the output ITensor's dims to reshape the Fluid Tensor.
+      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
+
+      auto* fluid_v = context.scope().FindVar(y);
+      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
+      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      fluid_t->Resize(framework::make_ddim(ddim));
+      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
+      if (platform::is_cpu_place(fluid_t->place())) {
+        // TODO(Superjomn) change this float to dtype size.
+        engine_->GetOutputInCPU(
+            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
+            size * sizeof(float));
+      } else {
+        engine_->GetOutputInGPU(
+            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+            size * sizeof(float));
+      }
+    }
+
+    cudaStreamSynchronize(stream_);
+  }
+
+ protected:
+  // Build the engine.
+  void Prepare(const framework::ExecutionContext& context) const;
+
+ private:
+  mutable cudaStream_t stream_;
+  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
+  mutable int max_batch_{0};
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
new file mode 100644
index 0000000000..6f383de259
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -0,0 +1,152 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+USE_CPU_ONLY_OP(tensorrt_engine);
+
+namespace paddle {
+namespace operators {
+
+namespace {
+void CreateCPUTensor(framework::Scope* scope, const std::string& name,
+                     const std::vector<int64_t>& shape) {
+  auto* var = scope->Var(name);
+  auto* tensor = var->GetMutable<framework::LoDTensor>();
+  auto dims = framework::make_ddim(shape);
+  tensor->Resize(dims);
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  inference::tensorrt::RandomizeTensor(tensor, place, ctx);
+}
+
+void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
+                          const std::string& name,
+                          const std::vector<int64_t>& shape) {
+  using framework::proto::VarType;
+  auto* var = block->add_vars();
+  framework::VarDesc desc(name);
+  desc.SetType(VarType::LOD_TENSOR);
+  desc.SetDataType(VarType::FP32);
+  desc.SetShape(shape);
+  *var = *desc.Proto();
+}
+
+template <typename T>
+void SetAttr(framework::proto::OpDesc* op, const std::string& name,
+             const T& data);
+
+template <>
+void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
+                          const std::string& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRING);
+  attr->set_s(data);
+}
+template <>
+void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
+                  const int& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(data);
+}
+template <>
+void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
+                      const int64_t& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::LONG);
+  attr->set_l(data);
+}
+
+}  // namespace
+
+TEST(TensorRTEngineOp, manual) {
+  framework::ProgramDesc program;
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+  LOG(INFO) << "create mul op";
+  auto* mul = block_desc.AppendOp();
+  mul->SetType("mul");
+  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
+  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+
+  LOG(INFO) << "create fc op";
+  auto* fc = block_desc.AppendOp();
+  fc->SetType("mul");
+  fc->SetInput("X", std::vector<std::string>({"z"}));
+  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+
+  // Set inputs' variable shape in BlockDesc
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
+  AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
+  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
+
+  // It is wired, need to copy manually.
+  *block_->add_ops() = *mul->Proto();
+  *block_->add_ops() = *fc->Proto();
+
+  ASSERT_EQ(block_->ops_size(), 2);
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+
+  LOG(INFO) << "create engine op";
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // Prepare variables.
+  CreateCPUTensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  CreateCPUTensor(&scope, "y", std::vector<int64_t>({4, 6}));
+  CreateCPUTensor(&scope, "z", std::vector<int64_t>({2, 6}));
+
+  CreateCPUTensor(&scope, "y0", std::vector<int64_t>({6, 8}));
+  CreateCPUTensor(&scope, "z0", std::vector<int64_t>({2, 8}));
+
+  // Execute them.
+  LOG(INFO) << "engine_op run";
+  engine_op->Run(scope, place);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_TRT_CONVERTER(mul)
+USE_TRT_CONVERTER(fc)
diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc
index bbae1d54aa..5015b10055 100644
--- a/paddle/fluid/operators/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
@@ -20,13 +20,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #include "paddle/fluid/string/printf.h"
 
+#ifdef PADDLE_WITH_GRPC
+#include "paddle/fluid/operators/send_recv_util.h"
+#endif
+
 USE_NO_KERNEL_OP(listen_and_serv);
 
 namespace f = paddle::framework;
@@ -35,42 +40,43 @@ namespace m = paddle::operators::math;
 namespace detail = paddle::operators::detail;
 namespace string = paddle::string;
 
-std::unique_ptr<detail::AsyncGRPCServer> rpc_service;
+std::unique_ptr<detail::RPCServer> g_rpc_service;
+std::unique_ptr<detail::RequestHandler> g_req_handler;
 
-void StartServer(std::atomic<bool>* initialized) {
+void StartServer() {
   f::Scope scope;
   p::CPUPlace place;
   scope.Var(NCCL_ID_VARNAME);
   p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
   auto& dev_ctx = *pool.Get(p::CPUPlace());
 
-  rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", true));
-
   f::ProgramDesc empty_program;
   f::Executor executor(dev_ctx.GetPlace());
-  rpc_service->SetScope(&scope);
-  rpc_service->SetDevCtx(&dev_ctx);
-  rpc_service->SetProgram(&empty_program);
-  rpc_service->SetExecutor(&executor);
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetDevCtx(&dev_ctx);
+  g_req_handler->SetProgram(&empty_program);
+  g_req_handler->SetExecutor(&executor);
+
+  g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
 
   std::thread server_thread(
-      std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, rpc_service.get()));
-  *initialized = true;
-  rpc_service->SetCond(0);
-  auto recv = rpc_service->Get();
+      std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
+
+  g_rpc_service->SetCond(detail::kRequestSend);
+  g_rpc_service->WaitBarrier(detail::kRequestSend);
+
   LOG(INFO) << "got nccl id and stop server...";
-  rpc_service->ShutDown();
+  g_rpc_service->ShutDown();
   server_thread.join();
 }
 
-TEST(SendNcclId, Normal) {
-  std::atomic<bool> initialized{false};
-  std::thread server_thread(StartServer, &initialized);
-  while (!initialized) {
-  }
-  // wait server to start
-  // sleep(2);
-  rpc_service->WaitServerReady();
+TEST(SendNcclId, RPCServer) {
+  g_req_handler.reset(new detail::RequestSendHandler(true));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
+
+  std::thread server_thread(StartServer);
+  g_rpc_service->WaitServerReady();
 
   f::Scope scope;
   p::CPUPlace place;
@@ -78,17 +84,22 @@ TEST(SendNcclId, Normal) {
   auto& dev_ctx = *pool.Get(p::CPUPlace());
 
   auto var = scope.Var(NCCL_ID_VARNAME);
-  // var->SetType(f::proto::VarType_Type_RAW);
   auto id = var->GetMutable<ncclUniqueId>();
   p::dynload::ncclGetUniqueId(id);
 
-  int port = rpc_service->GetSelectedPort();
+  int port = g_rpc_service->GetSelectedPort();
+
   std::string ep = string::Sprintf("127.0.0.1:%d", port);
-  detail::RPCClient client;
 
-  client.AsyncSendVariable(ep, dev_ctx, scope, NCCL_ID_VARNAME);
-  client.Wait();
+  detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
+
+  LOG(INFO) << "connect to server" << ep;
+  client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
+  client->Wait();
+  client->AsyncSendBatchBarrier(ep);
+  client->Wait();
+
   server_thread.join();
-  auto* ptr = rpc_service.release();
-  delete ptr;
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
 }
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index c17d1afc30..4a8ac441cf 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
 Top K operator
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index d44eeae8e6..7ddb82ef6f 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -55,6 +55,9 @@ class TopkKernel : public framework::OpKernel<T> {
     // NOTE: eigen shape doesn't affect paddle tensor.
     eg_input.reshape(flat2dims);
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
     for (size_t i = 0; i < row; i++) {
       std::vector<std::pair<T, size_t>> vec;
       for (size_t j = 0; j < col; j++) {
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index 705cc894c0..ab70c1f059 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -186,8 +186,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
 
     // warpctc accesses labels in CPU memory
     Tensor warpctc_label;
-    TensorCopy(*label, platform::CPUPlace(), ctx.device_context(),
-               &warpctc_label);
+    TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
     const int* warpctc_label_data = warpctc_label.data<int>();
     // warpctc stores loss in CPU memory
     Tensor warpctc_loss;
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 123d3598f4..2ce9b31bb8 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
-#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#if defined(__CUDA_ARCH__)
 #include <stdio.h>
 #define PADDLE_ASSERT(e)                                           \
   do {                                                             \
@@ -38,6 +38,9 @@ limitations under the License. */
   } while (0)
 #else
 #include <assert.h>
-#define PADDLE_ASSERT(e) assert(e)
+// For cuda, the assertions can affect performance and it is therefore
+// recommended to disable them in production code
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
+#define PADDLE_ASSERT(e) assert((e))
 #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
 #endif
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 4fc9aae8e3..c202eed354 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -21,12 +21,17 @@ limitations under the License. */
 #include <unistd.h>
 #endif
 
+#include <algorithm>
 #include "gflags/gflags.h"
 
 DEFINE_double(fraction_of_cpu_memory_to_use, 1,
               "Default use 100% of CPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");
 
+DEFINE_uint64(
+    initial_cpu_memory_in_mb, 500,
+    "Default initial 500MB of CPU memory for PaddlePaddle, in MD unit.");
+
 DEFINE_double(
     fraction_of_cuda_pinned_memory_to_use, 0.5,
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
@@ -54,7 +59,9 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+  return std::min(static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
+                                      CpuTotalPhysicalMemory()),
+                  FLAGS_initial_cpu_memory_in_mb * 1 << 20);
 }
 
 size_t CpuMinChunkSize() {
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index c0d399d078..6ea4f8b7cb 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/macros.h"
 
+DECLARE_bool(cudnn_deterministic);
+
 namespace paddle {
 namespace platform {
 
@@ -76,8 +78,44 @@ enum class DataLayout {  // Not use
 enum class PoolingMode {
   kMaximum,
   kAverage,
+  kMaximumDeterministic,
 };
 
+#if CUDNN_VERSION < 6000
+#pragma message "CUDNN version under 6.0 is supported at best effort."
+#pragma message "We strongly encourage you to move to 6.0 and above."
+#pragma message "This message is intended to annoy you enough to update."
+#pragma message \
+    "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/"
+
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#else
+
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX_DETERMINISTIC;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#endif  // CUDNN_VERSION < 6000
+
 template <typename T>
 class CudnnDataType;
 
@@ -293,9 +331,7 @@ class ScopedPoolingDescriptor {
     PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
     PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
     PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor(
-        desc_, (mode == PoolingMode::kMaximum
-                    ? CUDNN_POOLING_MAX
-                    : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
+        desc_, (GetPoolingMode(mode)),
         CUDNN_PROPAGATE_NAN,  // Always propagate nans.
         kernel.size(), kernel.data(), pads.data(), strides.data()));
     return desc_;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 1f733d71bd..6c50ab2685 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -175,7 +175,6 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
-  std::lock_guard<std::recursive_mutex> guard(mutex_);
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
   PADDLE_ENFORCE(cudaGetLastError());
 }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index a9c1984616..292ffef1ae 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -11,6 +11,7 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -100,7 +101,7 @@ class CUDADeviceContext : public DeviceContext {
 
   template <typename Callback>
   void RecordEvent(cudaEvent_t ev, Callback callback) {
-    std::lock_guard<std::recursive_mutex> guard(mutex_);
+    std::lock_guard<std::mutex> guard(mtx_);
     callback();
     PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
   }
@@ -110,8 +111,6 @@ class CUDADeviceContext : public DeviceContext {
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-
-  mutable std::recursive_mutex mutex_;
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
@@ -119,6 +118,8 @@ class CUDADeviceContext : public DeviceContext {
   int compute_capability;
   int multi_process;
   int max_threads_per_mp;
+
+  std::mutex mtx_;
 };
 
 template <>
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index c9e1063168..1a9be044e0 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -245,7 +245,6 @@ class DeviceTracerImpl : public DeviceTracer {
   void Enable() {
     std::lock_guard<std::mutex> l(trace_mu_);
     if (enabled_) {
-      fprintf(stderr, "DeviceTracer already enabled\n");
       return;
     }
     EnableActivity();
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 81acaff87d..25bcda7eed 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -45,7 +45,7 @@ extern void *cublas_dso_handle;
       std::call_once(cublas_dso_flag, []() {                                 \
         cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
       });                                                                    \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);                  \
+      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
       return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...);               \
     }                                                                        \
   };                                                                         \
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 34d83e3956..77e46fa768 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -39,7 +39,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
         cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
       });                                                                  \
       EnforceCUDNNLoaded(#__name);                                         \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                 \
+      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);          \
       return reinterpret_cast<cudnn_func>(p_##__name)(args...);            \
     }                                                                      \
   };                                                                       \
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index e64de7c20f..2ad52bc7d3 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -45,7 +45,7 @@ extern void *cupti_dso_handle;
       std::call_once(cupti_dso_flag, []() {                                \
         cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
       });                                                                  \
-      void *p_##__name = dlsym(cupti_dso_handle, #__name);                 \
+      static void *p_##__name = dlsym(cupti_dso_handle, #__name);          \
       return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
     }                                                                      \
   };                                                                       \
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 46ad4379d5..5b9e0820e0 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -34,7 +34,7 @@ extern void *curand_dso_handle;
       std::call_once(curand_dso_flag, []() {                                 \
         curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
       });                                                                    \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);                  \
+      static void *p_##__name = dlsym(curand_dso_handle, #__name);           \
       return reinterpret_cast<curandFunc>(p_##__name)(args...);              \
     }                                                                        \
   };                                                                         \
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index 37902ae20c..575516f818 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -37,7 +37,7 @@ extern void* nccl_dso_handle;
       std::call_once(nccl_dso_flag, []() {                               \
         nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
       });                                                                \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);                \
+      static void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
       return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
     }                                                                    \
   };                                                                     \
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index f584a49da0..5d67658b94 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -40,7 +40,7 @@ extern void* tensorrt_dso_handle;
             paddle::platform::dynload::GetTensorRtDsoHandle();          \
         PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
       });                                                               \
-      void* p_##__name = dlsym(tensorrt_dso_handle, #__name);           \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);    \
       PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
       return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
     }                                                                   \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 7c70649d21..d157c1fda7 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -40,7 +40,7 @@ extern void* warpctc_dso_handle;
       std::call_once(warpctc_dso_flag, []() {                                  \
         warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
       });                                                                      \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);                    \
+      static void* p_##_name = dlsym(warpctc_dso_handle, #__name);             \
       return reinterpret_cast<warpctcFunc>(p_##_name)(args...);                \
     }                                                                          \
   };                                                                           \
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 56ed5912a1..de711b7d23 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <mkldnn.h>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
@@ -38,6 +39,11 @@ void* to_void_cast(const Type* t) {
   return static_cast<void*>(const_cast<Type*>(t));
 }
 
+template <typename Type>
+void* to_void_reinterpret_cast(const Type* t) {
+  return reinterpret_cast<void*>(const_cast<Type*>(t));
+}
+
 template <class Type>
 using tf_desc = typename Type::desc;
 
@@ -81,5 +87,17 @@ inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
   return mkldnn::memory::f32;
 }
 
+inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
+  auto reorder_prim = mkldnn::reorder(src, dst);
+  std::vector<mkldnn::primitive> pipeline;
+  pipeline.push_back(reorder_prim);
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
+  return static_cast<mkldnn::memory::format>(
+      memory.get_primitive_desc().desc().data.format);
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 09367889a9..6f8e3f22db 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <stdio.h>
+#include <string>
 #include <thread>  // NOLINT
 #include <typeindex>
 #include <vector>
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 2fb5c6dc6b..01de9d7041 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -38,6 +38,7 @@ struct EventList;
 
 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
+std::mutex profiler_mu;
 
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
@@ -126,6 +127,7 @@ double Event::CpuElapsedMs(const Event& e) const {
 
 double Event::CudaElapsedMs(const Event& e) const {
 #ifdef PADDLE_WITH_CUDA
+  if (!has_cuda_) return 0.0;
   PADDLE_ENFORCE(e.has_cuda() && has_cuda());
   PADDLE_ENFORCE(e.device() == device());
   PADDLE_ENFORCE(cudaEventSynchronize(event_));
@@ -228,6 +230,8 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
+
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (state == g_state) {
     return;
   }
@@ -295,7 +299,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   } else if (g_state == ProfilerState::kAll) {
     place = "All";
   } else {
-    PADDLE_THROW("Invalid profiler state");
+    PADDLE_THROW("Invalid profiler state", g_state);
   }
 
   std::cout << "Place: " << place << std::endl;
@@ -443,6 +447,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
 
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
@@ -466,7 +471,7 @@ void SetProfileListener() {
   std::mt19937 rng;
   rng.seed(std::random_device()());
   std::uniform_int_distribution<std::mt19937::result_type> dist6(
-      1, std::numeric_limits<std::mt19937::result_type>::max());
+      1, std::numeric_limits<int>::max());
   profiler_lister_id = dist6(rng);
 }
 int64_t ListenerId() { return profiler_lister_id; }
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 643bb6183d..bf43925373 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -116,6 +116,8 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);
 
+const int kEnableProfiler = 1;
+const int kDisableProfiler = 2;
 // Test if the profiler is currently enabled.
 bool IsProfileEnabled();
 // Whether the trainer should send profiling state to PS.
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 3f28e61649..76aa7d2010 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/const_value.h"
+#include <paddle/fluid/framework/op_proto_maker.h>
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
@@ -23,6 +24,22 @@ void BindConstValue(pybind11::module* m) {
   m->def("kTempVarName", [] { return framework::kTempVarName; });
   m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
   m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+
+  auto op_proto_and_checker_maker =
+      m->def_submodule("op_proto_and_checker_maker");
+
+  pybind11::enum_<framework::OpRole>(op_proto_and_checker_maker, "OpRole")
+      .value("Forward", framework::OpRole::kForward)
+      .value("Backward", framework::OpRole::kBackward)
+      .value("Optimize", framework::OpRole::kOptimize)
+      .value("Loss", framework::OpRole::kLoss)
+      .value("RPC", framework::OpRole::kRPC);
+
+  op_proto_and_checker_maker.def(
+      "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);
+  op_proto_and_checker_maker.def(
+      "kOpRoleVarAttrName",
+      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName);
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 50a1c07251..bd5c613f8c 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -117,6 +117,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<int64_t>)
       .def("set", PyCPUTensorSetFromArray<bool>)
       .def("set", PyCPUTensorSetFromArray<uint16_t>)
+      .def("set", PyCPUTensorSetFromArray<uint8_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
@@ -124,12 +125,14 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCUDATensorSetFromArray<int64_t>)
       .def("set", PyCUDATensorSetFromArray<bool>)
       .def("set", PyCUDATensorSetFromArray<uint16_t>)
+      .def("set", PyCUDATensorSetFromArray<uint8_t>)
       .def("set", PyCUDAPinnedTensorSetFromArray<float>)
       .def("set", PyCUDAPinnedTensorSetFromArray<int>)
       .def("set", PyCUDAPinnedTensorSetFromArray<double>)
       .def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
       .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
       .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
@@ -410,6 +413,9 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
+#ifdef PADDLE_WITH_DISTRIBUTE
+      .def("complete", &Executor::Complete)
+#endif
       .def("run",
            (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
                Executor::Run);
@@ -492,6 +498,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("enable_profiler", platform::EnableProfiler);
   m.def("disable_profiler", platform::DisableProfiler);
+  m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
 
   // -- python binds for parallel executor.
@@ -505,16 +512,24 @@ All parameter, weight, gradient are variables in Paddle.
             self.num_threads_ = num_threads;
           })
       .def_property(
-          "use_event",
-          [](const ExecutionStrategy &self) { return self.use_event_; },
-          [](ExecutionStrategy &self, bool use_event) {
-            self.use_event_ = use_event;
+          "use_cuda",
+          [](const ExecutionStrategy &self) { return self.use_cuda_; },
+          [](ExecutionStrategy &self, bool use_cuda) {
+            self.use_cuda_ = use_cuda;
           })
       .def_property(
           "allow_op_delay",
           [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
           [](ExecutionStrategy &self, bool allow_op_delay) {
             self.allow_op_delay_ = allow_op_delay;
+          })
+      .def_property(
+          "num_iteration_per_drop_scope",
+          [](const ExecutionStrategy &self) {
+            return self.num_iteration_per_drop_scope_;
+          },
+          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
+            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
           });
   py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
 
@@ -541,6 +556,12 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
             self.gradient_scale_ = strategy;
+          })
+      .def_property(
+          "debug_graphviz_path",
+          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
+          [](BuildStrategy &self, const std::string &path) {
+            self.debug_graphviz_path_ = path;
           });
 
   pe.def(py::init<const std::vector<platform::Place> &,
diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc
index 82d9aa601c..6c65d9160c 100644
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
@@ -119,40 +119,56 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const {
 }
 
 bool Chunk::Parse(std::istream& sin) {
-  Header hdr;
-  bool ok = hdr.Parse(sin);
+  ChunkParser parser(sin);
+  if (!parser.Init()) {
+    return false;
+  }
+  Clear();
+  while (parser.HasNext()) {
+    Add(parser.Next());
+  }
+  return true;
+}
+
+ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {}
+bool ChunkParser::Init() {
+  pos_ = 0;
+  bool ok = header_.Parse(in_);
   if (!ok) {
     return ok;
   }
-  auto beg_pos = sin.tellg();
-  uint32_t crc = Crc32Stream(sin, hdr.CompressSize());
-  PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
-  Clear();
-  sin.seekg(beg_pos, sin.beg);
-  std::unique_ptr<std::istream> compressed_stream;
-  switch (hdr.CompressType()) {
+  auto beg_pos = in_.tellg();
+  uint32_t crc = Crc32Stream(in_, header_.CompressSize());
+  PADDLE_ENFORCE_EQ(header_.Checksum(), crc);
+  in_.seekg(beg_pos, in_.beg);
+
+  switch (header_.CompressType()) {
     case Compressor::kNoCompress:
       break;
     case Compressor::kSnappy:
-      compressed_stream.reset(new snappy::iSnappyStream(sin));
+      compressed_stream_.reset(new snappy::iSnappyStream(in_));
       break;
     default:
       PADDLE_THROW("Not implemented");
   }
+  return true;
+}
 
-  std::istream& stream = compressed_stream ? *compressed_stream : sin;
+bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); }
 
-  for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
-    uint32_t rec_len;
-    stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
-    std::string buf;
-    buf.resize(rec_len);
-    stream.read(&buf[0], rec_len);
-    PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
-    Add(buf);
+std::string ChunkParser::Next() {
+  if (!HasNext()) {
+    return "";
   }
-  return true;
+  ++pos_;
+  std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_;
+  uint32_t rec_len;
+  stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
+  std::string buf;
+  buf.resize(rec_len);
+  stream.read(&buf[0], rec_len);
+  PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
+  return buf;
 }
-
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h
index 71a1556a33..cfb954a591 100644
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -53,9 +54,20 @@ class Chunk {
   DISABLE_COPY_AND_ASSIGN(Chunk);
 };
 
-size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
+class ChunkParser {
+ public:
+  explicit ChunkParser(std::istream& sin);
+
+  bool Init();
+  std::string Next();
+  bool HasNext() const;
 
-void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
+ private:
+  Header header_;
+  uint32_t pos_{0};
+  std::istream& in_;
+  std::unique_ptr<std::istream> compressed_stream_;
+};
 
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc
index 88b4d4001b..06a13e6c5b 100644
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
@@ -22,35 +22,33 @@ namespace paddle {
 namespace recordio {
 
 Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
-    : stream_(std::move(stream)) {
+    : stream_(std::move(stream)), parser_(*stream_) {
   Reset();
 }
 
-Scanner::Scanner(const std::string &filename) {
-  stream_.reset(new std::ifstream(filename));
+Scanner::Scanner(const std::string &filename)
+    : stream_(new std::ifstream(filename)), parser_(*stream_) {
   Reset();
 }
 
 void Scanner::Reset() {
   stream_->clear();
   stream_->seekg(0, std::ios::beg);
-  ParseNextChunk();
+  parser_.Init();
 }
 
 std::string Scanner::Next() {
-  PADDLE_ENFORCE(!eof_, "StopIteration");
-  auto rec = cur_chunk_.Record(offset_++);
-  if (offset_ == cur_chunk_.NumRecords()) {
-    ParseNextChunk();
+  if (stream_->eof()) {
+    return "";
   }
-  return rec;
-}
 
-void Scanner::ParseNextChunk() {
-  eof_ = !cur_chunk_.Parse(*stream_);
-  offset_ = 0;
+  auto res = parser_.Next();
+  if (!parser_.HasNext() && HasNext()) {
+    parser_.Init();
+  }
+  return res;
 }
 
-bool Scanner::HasNext() const { return !eof_; }
+bool Scanner::HasNext() const { return !stream_->eof(); }
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/scanner.h b/paddle/fluid/recordio/scanner.h
index 34f1b0c78d..0d885dd87a 100644
--- a/paddle/fluid/recordio/scanner.h
+++ b/paddle/fluid/recordio/scanner.h
@@ -37,11 +37,7 @@ class Scanner {
 
  private:
   std::unique_ptr<std::istream> stream_;
-  Chunk cur_chunk_;
-  size_t offset_;
-  bool eof_;
-
-  void ParseNextChunk();
+  ChunkParser parser_;
 };
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
new file mode 100644
index 0000000000..78d6e5ff55
--- /dev/null
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cpp_train_demo CXX C)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir")
+endif()
+
+option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
+option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       OFF)
+
+include_directories("${PADDLE_LIB}")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+add_executable(demo_trainer demo_trainer.cc)
+
+if(WITH_MKLDNN)
+  include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
+  set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
+endif()
+
+if(WITH_MKL)
+  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+else()
+  if(APPLE)
+    set(MATH_LIB cblas)
+  else(APPLE)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+  endif(APPLE)
+endif()
+
+if(APPLE)
+  set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
+else(APPLE)
+  set(ARCHIVE_START "-Wl,--whole-archive")
+  set(ARCHIVE_END "-Wl,--no-whole-archive")
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+endif(APPLE)
+
+target_link_libraries(demo_trainer
+        ${MACOS_LD_FLAGS}
+        ${ARCHIVE_START}
+        ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+        ${ARCHIVE_END}
+        ${MATH_LIB}
+        ${MKLDNN_LIB}
+        glog gflags protobuf snappystream snappy z
+        ${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
new file mode 100644
index 0000000000..41b01d3382
--- /dev/null
+++ b/paddle/fluid/train/demo/README.md
@@ -0,0 +1,66 @@
+
+### step 1. build paddle lib
+
+```
+
+# WITH_MKL=ON|OFF
+# WITH_MKLDNN=ON|OFF
+
+PADDLE_LIB=/paddle/lib/dir
+cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
+         -DCMAKE_BUILD_TYPE=Release \
+         -DWITH_FLUID_ONLY=ON \
+         -DWITH_GPU=OFF \
+         -DWITH_STYLE_CHECK=OFF \
+         -DWITH_MKL=OFF \
+         -DWITH_MKLDNN=OFF
+make -j8
+make -j8 inference_lib_dist
+```
+
+### step 2. generate program desc
+```
+# please install paddle before run this scripe
+pip install --upgrade paddlepaddle-*.whl
+python demo_network.py
+```
+
+This will generate two program desc files:
+  - startup_program: used to init all parameters
+  - main_program: main logic of the network
+
+### step 3. build demo_trainer and run it.
+
+
+```
+# Make a build dir at the same dir of this README.md document.
+# The demo dir can be put anywhere.
+mkdir build
+cd build
+
+# WITH_MKL=ON|OFF
+# WITH_MKLDNN=ON|OFF
+PADDLE_LIB=/paddle/lib/dir
+
+# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib
+cmake .. -DPADDLE_LIB=$PADDLE_LIB \
+         -DWITH_MKLDNN=OFF \
+         -DWITH_MKL=OFF
+make
+
+# copy startup_program and main_program to this dir
+cp ../startup_program .
+cp ../main_program .
+
+# run demo cpp trainer
+./demo_trainer
+
+```
+
+The output will be:
+```
+step: 0 loss: 1069.02
+step: 1 loss: 1069.02
+step: 2 loss: 1069.02
+....
+```
diff --git a/paddle/fluid/train/demo/demo_network.py b/paddle/fluid/train/demo/demo_network.py
new file mode 100644
index 0000000000..41e98c6a24
--- /dev/null
+++ b/paddle/fluid/train/demo/demo_network.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+
+
+def train_network(with_optimize):
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    if with_optimize:
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.00001)
+        sgd_optimizer.minimize(avg_cost)
+    else:
+        fluid.backward.append_backward(avg_cost)
+
+
+def save_program_desc(network_func):
+    startup_program = framework.Program()
+    train_program = framework.Program()
+
+    with framework.program_guard(train_program, startup_program):
+        network_func(with_optimize=False)
+
+    with open("startup_program", "w") as f:
+        f.write(startup_program.desc.serialize_to_string())
+    with open("main_program", "w") as f:
+        f.write(train_program.desc.serialize_to_string())
+
+
+save_program_desc(train_network)
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
new file mode 100644
index 0000000000..813d838686
--- /dev/null
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -0,0 +1,103 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace train {
+
+void ReadBinaryFile(const std::string& filename, std::string* contents) {
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
+  contents->clear();
+  contents->resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(contents->at(0)), contents->size());
+  fin.close();
+}
+
+std::unique_ptr<paddle::framework::ProgramDesc> Load(
+    paddle::framework::Executor* executor, const std::string& model_filename) {
+  VLOG(3) << "loading model from " << model_filename;
+  std::string program_desc_str;
+  ReadBinaryFile(model_filename, &program_desc_str);
+
+  std::unique_ptr<paddle::framework::ProgramDesc> main_program(
+      new paddle::framework::ProgramDesc(program_desc_str));
+  return main_program;
+}
+
+}  // namespace train
+}  // namespace paddle
+
+int main() {
+  paddle::framework::InitDevices(false);
+
+  const auto cpu_place = paddle::platform::CPUPlace();
+
+  paddle::framework::Executor executor(cpu_place);
+  paddle::framework::Scope scope;
+  auto startup_program = paddle::train::Load(&executor, "startup_program");
+  auto train_program = paddle::train::Load(&executor, "main_program");
+
+  std::string loss_name = "";
+  for (auto op_desc : train_program->Block(0).AllOps()) {
+    if (op_desc->Type() == "mean") {
+      loss_name = op_desc->Output("Out")[0];
+      break;
+    }
+  }
+
+  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+
+  // init all parameters
+  executor.Run(*startup_program.get(), &scope, 0);
+
+  // prepare data
+  auto x_var = scope.Var("x");
+  auto x_tensor = x_var->GetMutable<paddle::framework::LoDTensor>();
+  x_tensor->Resize({2, 13});
+
+  auto x_data = x_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 13; ++i) {
+    x_data[i] = static_cast<float>(i);
+  }
+
+  auto y_var = scope.Var("y");
+  auto y_tensor = y_var->GetMutable<paddle::framework::LoDTensor>();
+  y_tensor->Resize({2, 1});
+  auto y_data = y_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 1; ++i) {
+    y_data[i] = static_cast<float>(i);
+  }
+
+  auto loss_var = scope.Var(loss_name);
+
+  for (int i = 0; i < 10; ++i) {
+    executor.Run(*train_program.get(), &scope, 0, false, true);
+    std::cout << "step: " << i << " loss: "
+              << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
+              << std::endl;
+  }
+  return 0;
+}
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
index aa53853e08..f01f89a727 100644
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
@@ -33,7 +33,7 @@ namespace paddle {
  * \param outputs[0] Image data of NCHW format.
  */
 class BlockExpandFunction : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     strides_ = config.get<std::vector<size_t>>("strides");
@@ -81,7 +81,7 @@ public:
                         (size_t)blockW()});
   }
 
-protected:
+ protected:
   std::vector<size_t> strides_;
   std::vector<size_t> paddings_;
   std::vector<size_t> blocks_;
@@ -101,7 +101,7 @@ protected:
 
 template <DeviceType Device>
 class BlockExpandForward : public BlockExpandFunction {
-public:
+ public:
   void init(const FuncConfig& config) override {
     BlockExpandFunction::init(config);
   }
@@ -149,7 +149,7 @@ public:
 
 template <DeviceType Device>
 class BlockExpandBackward : public BlockExpandFunction {
-public:
+ public:
   void init(const FuncConfig& config) override {
     BlockExpandFunction::init(config);
   }
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 89ee09837d..6de8c94e77 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -63,12 +63,12 @@ enum ArgType {
   ADD_TO = 2,
 };
 class BufferArg {
-public:
+ public:
   void setArgType(ArgType argType) { argType_ = argType; }
 
   ArgType getArgType() const { return argType_; }
 
-public:
+ public:
   BufferArg(ValueType valueType,
             const TensorShape& shape,
             ArgType argType = UNSPECIFIED)
@@ -169,7 +169,7 @@ public:
   const SequenceArg& sequence() const;
   const SparseMatrixArg& sparse() const;
 
-protected:
+ protected:
   void* buf_;
   ValueType valueType_;
   TensorShape shape_;
@@ -185,7 +185,7 @@ protected:
 // valueType_ = int32
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
-public:
+ public:
   SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
       : BufferArg(VALUE_TYPE_INT32, shape, argType) {
     bufferType_ = TENSOR_SEQUENCE_ID;
@@ -212,7 +212,7 @@ public:
 
   size_t numSeqs() const { return numSeqs_; }
 
-private:
+ private:
   size_t numSeqs_;
 };
 
@@ -222,7 +222,7 @@ private:
 // SequenceArg can be used to represent sequences that contain multiple
 // unequal lengths.
 class SequenceArg : public BufferArg {
-public:
+ public:
   SequenceArg(ValueType valueType,
               const TensorShape& shape,
               ArgType argType = UNSPECIFIED)
@@ -255,7 +255,7 @@ public:
   SequenceIdArg& getSequenceId() { return startPositions_; }
   const SequenceIdArg& getSequenceId() const { return startPositions_; }
 
-private:
+ private:
   SequenceIdArg startPositions_;
 };
 
@@ -263,7 +263,7 @@ private:
 // valueType_ == float or double
 // shape_.ndims() == 2
 class SparseMatrixArg : public BufferArg {
-public:
+ public:
   SparseMatrixArg(void* buf,
                   ValueType valueType,
                   const TensorShape& shape,
@@ -353,7 +353,7 @@ public:
 
   SparseDataType dataType() const { return type_; }
 
-private:
+ private:
   BufferArg row_;
   BufferArg col_;
   size_t nnz_;
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 9b2779b42c..29b4ac098e 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -52,9 +52,3 @@ add_simple_unittest(Im2ColTest)
 add_simple_unittest(GemmConvOpTest)
 add_simple_unittest(DepthwiseConvOpTest)
 endif()
-
-add_style_check_target(paddle_function ${h_files})
-add_style_check_target(paddle_function ${cpp_files})
-if(WITH_GPU)
-    add_style_check_target(paddle_function ${cu_files})
-endif()
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 904b0958e6..1187842452 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -100,7 +100,7 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -146,7 +146,7 @@ public:
                                      begin_pad_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
   size_t begin_pad_;
@@ -223,7 +223,7 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -278,7 +278,7 @@ public:
                                       total_pad_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
   size_t begin_pad_;
@@ -299,7 +299,7 @@ private:
  */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -331,7 +331,7 @@ public:
         out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
 };
@@ -348,7 +348,7 @@ private:
  */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -382,7 +382,7 @@ public:
                                             begin_pad_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
   size_t begin_pad_;
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
index 7d23d0079c..2d8437bcfe 100644
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -56,7 +56,7 @@ namespace paddle {
  *      H and W is height and width of filter.
  */
 class ConvFunctionBase : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     strides_ = config.get<std::vector<size_t>>("strides");
@@ -101,7 +101,7 @@ public:
     }
   }
 
-protected:
+ protected:
   size_t getFilterHeight(const TensorShape& filter) const {
     return filter[filter.ndims() - 2];
   }
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index 81bccc1a9c..2c25e1af44 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -97,7 +97,7 @@ class CosSimForwardFunc : public FunctionBase {
     CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
   }
 
-private:
+ private:
   real scale_;
 };
 
@@ -227,7 +227,7 @@ class CosSimBackwardFunc : public FunctionBase {
         out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
   }
 
-private:
+ private:
   real scale_;
 };
 
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
index 7aa527d216..5bd98910fe 100644
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
@@ -112,7 +112,7 @@ void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
  */
 template <DeviceType Device>
 class CropFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -130,7 +130,7 @@ public:
                  conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
@@ -145,7 +145,7 @@ private:
 
 template <DeviceType Device>
 class CropGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -163,7 +163,7 @@ public:
                      conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 75c0fc2a3d..7ff9227e5c 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -160,7 +160,7 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
  */
 template <DeviceType Device>
 class CrossMapNormalFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     size_ = config.get<size_t>("size");
@@ -220,7 +220,7 @@ public:
     return ops;
   }
 
-private:
+ private:
   size_t size_;
   real scale_;
   real pow_;
@@ -260,7 +260,7 @@ private:
  */
 template <DeviceType Device>
 class CrossMapNormalGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     size_ = config.get<size_t>("size");
@@ -328,7 +328,7 @@ public:
     return ops;
   }
 
-private:
+ private:
   size_t size_;
   real scale_;
   real pow_;
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 46651345b4..958034e08e 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -19,7 +19,7 @@ namespace paddle {
 
 template <class T>
 class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
@@ -43,7 +43,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
@@ -66,7 +66,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
@@ -93,7 +93,7 @@ public:
  */
 template <DeviceType Device>
 class DepthwiseConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -156,7 +156,7 @@ public:
  */
 template <DeviceType Device>
 class DepthwiseConvGradInputFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -220,7 +220,7 @@ public:
  */
 template <DeviceType Device>
 class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h
index 6700747314..7837edd1c0 100644
--- a/paddle/function/DepthwiseConvOp.h
+++ b/paddle/function/DepthwiseConvOp.h
@@ -44,7 +44,7 @@ namespace paddle {
  */
 template <DeviceType Device, class T>
 class DepthwiseConvFunctor {
-public:
+ public:
   void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
@@ -89,7 +89,7 @@ public:
  */
 template <DeviceType Device, class T>
 class DepthwiseConvGradInputFunctor {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
@@ -135,7 +135,7 @@ public:
  */
 template <DeviceType Device, class T>
 class DepthwiseConvGradFilterFunctor {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
index cd1d55a416..2c0e71b19b 100644
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -199,7 +199,7 @@ __global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
 
 template <class T>
 class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
@@ -249,7 +249,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
@@ -300,7 +300,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
index bac4659e62..8e9dbbd7a1 100644
--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <glog/logging.h>
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/function/EigenThreadDevice.h"
 
 namespace paddle {
 
@@ -70,25 +70,26 @@ struct EigenBlasGemm {
     dims[0].first = transA ? 0 : 1;
     dims[0].second = transB ? 1 : 0;
 
-    Eigen::DefaultDevice device;
+    auto* device = EigenDeviceWarpper::device();
     if (N == ldc) {
       if (alpha == T(1) && beta == T(0)) {
-        c.device(device) = a.contract(b, dims);
+        c.device(*device) = a.contract(b, dims);
       } else if (alpha == T(1) && beta == T(1)) {
-        c.device(device) += a.contract(b, dims);
+        c.device(*device) += a.contract(b, dims);
       } else {
-        c.device(device) = alpha * a.contract(b, dims) + beta * c;
+        c.device(*device) = alpha * a.contract(b, dims) + beta * c;
       }
     } else {
       if (alpha == T(1) && beta == T(0)) {
-        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
+        c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
       } else if (alpha == T(1) && beta == T(1)) {
-        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
+        c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
       } else {
-        c.slice(offsetC, extentC).device(device) =
+        c.slice(offsetC, extentC).device(*device) =
             alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
       }
     }
+    EigenDeviceWarpper::free_device(device);
   }
 };
 
diff --git a/paddle/function/EigenThreadDevice.h b/paddle/function/EigenThreadDevice.h
new file mode 100644
index 0000000000..eb92251c82
--- /dev/null
+++ b/paddle/function/EigenThreadDevice.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#pragma once
+
+#if defined(__OSX__) || defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+
+#if defined(__ANDROID__)
+int GetCpuCount() {
+  FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
+  if (!fp) {
+    return 1;
+  }
+  int rank0, rank1;
+  int num = fscanf(fp, "%d-%d", &rank0, &rank1);
+  fclose(fp);
+  if (num < 2) return 1;
+  return rank1 + 1;
+}
+#elif defined(__OSX__) || defined(__APPLE__)
+int GetCpuCount() {
+  int count = 0;
+  size_t len = sizeof(int);
+  sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
+  return count > 0 ? count : 1;
+}
+#else
+int GetCpuCount() { return 1; }
+#endif
+
+class EigenDeviceWarpper {
+ public:  // NOLINT
+#if EIGEN_USE_THREADS
+  static Eigen::ThreadPoolDevice* device() {
+    const int num_cpus = GetCpuCount();
+    const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
+    static Eigen::ThreadPool tp(num_threads);
+    static Eigen::ThreadPoolDevice* device =
+        new Eigen::ThreadPoolDevice(&tp, num_threads);
+    return device;
+  }
+
+  static void free_device(Eigen::ThreadPoolDevice* device) {
+    // do nothing
+  }
+#else
+  static Eigen::DefaultDevice* device() {
+    Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
+    return device;
+  }
+
+  static void free_device(Eigen::DefaultDevice* device) { delete device; }
+#endif
+};
+
+}  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 01288ef92e..a6c14ef29b 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -29,7 +29,7 @@ namespace paddle {
  * The argument type of Function::init.
  */
 class FuncConfig {
-public:
+ public:
   template <typename T>
   T get(const std::string& key, Error* err = nullptr) const {
     try {
@@ -59,7 +59,7 @@ public:
     return *this;
   }
 
-protected:
+ protected:
   mutable std::unordered_map<std::string, any> valueMap_;
 };
 
@@ -77,7 +77,7 @@ protected:
  * in the BufferArgs life time.
  */
 class BufferArgs {
-public:
+ public:
   BufferArgs() {}
 
   ~BufferArgs() {
@@ -137,7 +137,7 @@ public:
 
   void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
 
-private:
+ private:
   std::vector<BufferArg*> args_;
   // The BufferArg object is constructed and freed by BufferArgs.
   std::vector<BufferArg*> _args_;
@@ -163,7 +163,7 @@ private:
  * If Function has more than one output, each output can have different modes.
  */
 class FunctionBase {
-public:
+ public:
   virtual ~FunctionBase() {}
 
   virtual void init(const FuncConfig& config) {}
@@ -192,7 +192,7 @@ public:
 
   static ClassRegistrar<FunctionBase> funcRegistrar_;
 
-protected:
+ protected:
   // numInputs_ and numOutputs_ represents the maximum
   // input and output supported by Function.
   // Some functions are optimized for input and output,
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 56c3537b6a..14003d2c88 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -39,7 +39,7 @@ struct Allocator<DEVICE_TYPE_GPU> {
 // Copy argument1 to argument2
 template <DeviceType DType1, DeviceType DType2>
 class CopyArgument {
-public:
+ public:
   void operator()(const BufferArg& arg1, BufferArg& arg2) {
     CHECK_EQ(arg1.valueType(), arg2.valueType());
     CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
@@ -95,7 +95,7 @@ public:
  */
 template <DeviceType DType1, DeviceType DType2>
 class Compare2Function {
-public:
+ public:
   typedef typename test::Allocator<DType1>::type Allocator1;
   typedef typename test::Allocator<DType2>::type Allocator2;
   typedef typename Tensor<real, DType1>::Vector Vector1;
@@ -305,7 +305,7 @@ public:
 
   std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
 
-protected:
+ protected:
   // only init cpu argument, gpu argument copy from cpu argument.
   void initArg(BufferArg& arg) {
     Vector1 vector(arg.shape().getElements(), (real*)arg.data());
@@ -381,7 +381,7 @@ protected:
     }
   }
 
-protected:
+ protected:
   std::shared_ptr<FunctionBase> function1_;
   std::shared_ptr<FunctionBase> function2_;
   std::vector<std::shared_ptr<Allocator1>> func1Memory_;
@@ -400,7 +400,7 @@ protected:
 
 class CpuGpuFuncCompare
     : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
-public:
+ public:
   CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
       : Compare2Function(name + "-CPU", name + "-GPU", config) {}
 
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 2b7c6f9eab..5b023e2c10 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  */
 template <DeviceType Device>
 class GemmConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -136,7 +136,7 @@ public:
  */
 template <DeviceType Device>
 class GemmConvMobileFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -297,7 +297,7 @@ public:
  */
 template <DeviceType Device>
 class GemmConvGradInputFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -404,7 +404,7 @@ public:
  */
 template <DeviceType Device>
 class GemmConvGradFilterFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 6a07787000..e0ce6918a2 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -70,7 +70,7 @@ enum ColFormat { kCFO = 0, kOCF = 1 };
  */
 template <ColFormat Format, DeviceType Device, class T>
 class Im2ColFunctor {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -85,7 +85,7 @@ public:
 
 template <ColFormat Format, DeviceType Device, class T>
 class Col2ImFunctor {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
@@ -100,7 +100,7 @@ public:
 
 template <class T>
 class Im2ColMobileFunctor {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
index ad2aed8f3c..55a3ff98db 100644
--- a/paddle/function/Im2ColOp.cpp
+++ b/paddle/function/Im2ColOp.cpp
@@ -23,7 +23,7 @@ namespace paddle {
  */
 template <class T>
 class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -75,7 +75,7 @@ public:
  */
 template <class T>
 class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
@@ -130,7 +130,7 @@ template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
  */
 template <class T>
 class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -188,7 +188,7 @@ public:
  */
 template <class T>
 class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
index a944a0ee68..96dd8f528e 100644
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -71,7 +71,7 @@ __global__ void im2col(const T* data_im,
  */
 template <class T>
 class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -184,7 +184,7 @@ __global__ void col2im(size_t n,
  */
 template <class T>
 class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
@@ -292,7 +292,7 @@ __global__ void im2colOCF(const T* imData,
  */
 template <class T>
 class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -399,7 +399,7 @@ __global__ void col2imOCF(T* imData,
  */
 template <class T>
 class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 90cd4a2b6d..7bf36c8050 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -240,7 +240,7 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
  */
 template <DeviceType Device>
 class MulFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     aTrans_ = config.get<bool>("aTrans");
     bTrans_ = config.get<bool>("bTrans");
@@ -335,7 +335,7 @@ public:
     }
   }
 
-private:
+ private:
   bool aTrans_;
   bool bTrans_;
 };
diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp
index 22d3b33d0f..99c8b81acb 100644
--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  */
 template <class T>
 class NaiveConvFunctor {
-public:
+ public:
   void operator()(const T* inputData,
                   size_t batchSize,
                   size_t inputChannels,
@@ -85,7 +85,7 @@ public:
 
 template <DeviceType Device>
 class NaiveConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index db6dd518ca..5d7515e8c0 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -132,7 +132,7 @@ static inline PadConf castToPadConf(const FuncConfig& conf) {
 
 template <DeviceType Device>
 class PadFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -157,7 +157,7 @@ public:
                 pad_);
   }
 
-private:
+ private:
   PadConf pad_;
 };
 
@@ -173,7 +173,7 @@ private:
 
 template <DeviceType Device>
 class PadGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -201,7 +201,7 @@ public:
                     pad_);
   }
 
-private:
+ private:
   PadConf pad_;
 };
 
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index 925860346e..129e933458 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -129,7 +129,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
 
 template <DeviceType Device>
 class RowConvFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -176,7 +176,7 @@ public:
 template <DeviceType Device>
 class RowConvGradFunc : public FunctionBase {
   // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
index 6ed6eb2dba..9a06ef2a96 100644
--- a/paddle/function/ScaleSubRegionOp.cpp
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -92,7 +92,7 @@ void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
  */
 template <DeviceType Device>
 class ScaleSubRegionFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -109,7 +109,7 @@ public:
                            conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
@@ -124,7 +124,7 @@ private:
 
 template <DeviceType Device>
 class ScaleSubRegionGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -141,7 +141,7 @@ public:
                                conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
index 50e1d6c04c..750fb6bf28 100644
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
@@ -75,7 +75,7 @@ void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
  */
 template <DeviceType Device>
 class NCHW2NHWCFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -108,7 +108,7 @@ public:
  */
 template <DeviceType Device>
 class NHWC2NCHWFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
index 02d38c32c0..d4d1eae396 100644
--- a/paddle/function/TensorShape.h
+++ b/paddle/function/TensorShape.h
@@ -22,7 +22,7 @@ namespace paddle {
  * TensorShape used to represent shape of normal tensor.
  */
 class TensorShape {
-public:
+ public:
   TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
 
   TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
@@ -80,7 +80,7 @@ public:
 
   bool operator!=(const TensorShape& t) const { return !(*this == t); }
 
-private:
+ private:
   // compute number of elements
   void numElements() {
     nelements_ = 1;
diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
index d3298c7538..d7ac83da41 100644
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -21,7 +21,7 @@ namespace paddle {
 
 template <DeviceType Device>
 class NeonDepthwiseConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -66,18 +66,18 @@ public:
     float* inputPadding = inputData;
     int padInputHeight = inputHeight + 2 * paddingH();
     int padInputWidth = inputWidth + 2 * paddingW();
-    if (paddingH() > 0 || paddingW() > 0) {
-      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
-      resizeBuffer<Device>(newSize);
-      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-      neon::Padding<float>::run(inputData,
-                                inputPadding,
-                                batchSize * inputChannels,
-                                inputHeight,
-                                inputWidth,
-                                padInputHeight,
-                                padInputWidth);
-    }
+    int newSize =
+        batchSize * (inputChannels + 1) * padInputHeight * padInputWidth;
+
+    resizeBuffer<Device>(newSize);
+    inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+    neon::Padding<float>::run(inputData,
+                              inputPadding,
+                              batchSize * inputChannels,
+                              inputHeight,
+                              inputWidth,
+                              padInputHeight,
+                              padInputWidth);
 
     std::function<void(
         const float*, const float*, int, int, int, int, int, int, float*)>
diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
index d443d3fa49..1fc5daf607 100644
--- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
+++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
@@ -21,7 +21,7 @@ namespace paddle {
 
 template <DeviceType Device>
 class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index 3cdba4f2ed..48c997b50d 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -46,7 +46,7 @@ nnp_convolution_algorithm get_nnp_convolution_algorithm(
 
 template <DeviceType Device>
 class NNPACKConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
     algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
@@ -231,7 +231,7 @@ public:
     }
   }
 
-private:
+ private:
   nnp_convolution_algorithm algorithm_;
   nnp_convolution_transform_strategy transform_strategy_;
   void* workspaceBuffer_;
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 3d6ced713f..6dc877dd90 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -146,8 +146,6 @@ else()
         ${GSERVER_SOURCES})
 endif()
 
-add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
-add_style_check_target(paddle_gserver ${GSERVER_HEADER})
 add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 8d8f01234f..71c238fbfe 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -44,10 +44,10 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
  */
 #define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME)                             \
   class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \
-  private:                                                                   \
+   private:                                                                  \
     static const std::string name;                                           \
                                                                              \
-  public:                                                                    \
+   public:                                                                   \
     const std::string& getName() const { return name; }
 /**
  * @def END_DEFINE_ACTIVATION
@@ -70,7 +70,7 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
  * Do nothing when forward/backward.
  */
 class IdentityActivation : public ActivationFunction {
-public:
+ public:
   static const std::string name;
   Error __must_check forward(Argument& act) {
     (void)act;
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index 0f4b0fe0ab..8e2e144769 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -31,7 +31,7 @@ struct Argument;
  *
  */
 class ActivationFunction {
-public:
+ public:
   static ActivationFunction* create(const std::string& type);
   static std::vector<std::string> getAllRegisteredTypes();
 
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
index 56ffb83934..672444c656 100644
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -35,10 +35,10 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
  * @def END_MKLDNN_ACTIVATION
  */
 #define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
-private:                                                           \
+ private:                                                          \
   static const std::string name;                                   \
                                                                    \
-public:                                                            \
+ public:                                                           \
   const std::string& getName() const { return name; }              \
   }                                                                \
   ;                                                                \
@@ -63,11 +63,11 @@ public:                                                            \
 #define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
     ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
   BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
-private:                                                             \
+ private:                                                            \
   static const float alpha;                                          \
   static const float bwdAlpha;                                       \
                                                                      \
-public:                                                              \
+ public:                                                             \
   float getAlpha() const { return alpha; }                           \
   float getBwdAlpha() const { return bwdAlpha; }                     \
   END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
index 392b32c70d..eece1b9c37 100644
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -27,7 +27,7 @@ namespace paddle {
  * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
  */
 class MKLDNNActivation : public ActivationFunction {
-protected:
+ protected:
   // input value element count
   size_t cnt_;
   // should not merge the resetBwd into resetFwd,
@@ -43,7 +43,7 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-public:
+ public:
   MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
   ~MKLDNNActivation() {}
   static ActivationFunction* create(const std::string& type);
@@ -72,7 +72,7 @@ class MKLDNNEltwiseActivation : public MKLDNNActivation {
   typedef mkldnn::eltwise_backward eltwise_bwd;
   typedef mkldnn::algorithm algorithm;
 
-protected:
+ protected:
   // save the forward primitive desc, which can be used backward
   std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
   // eltwise_bwd need src input value
@@ -80,7 +80,7 @@ protected:
   // use for copy data
   std::shared_ptr<mkldnn::reorder> copyInVal_;
 
-public:
+ public:
   MKLDNNEltwiseActivation() {}
   ~MKLDNNEltwiseActivation() {}
   virtual const std::string& getName() const = 0;
@@ -102,12 +102,12 @@ public:
 class MKLDNNSoftmaxActivation : public MKLDNNActivation {
   typedef mkldnn::softmax_forward softmax_fwd;
 
-private:
+ private:
   // for backward
   MatrixPtr sftMaxSum_;
   MatrixPtr sftMaxDot_;
 
-public:
+ public:
   MKLDNNSoftmaxActivation() {}
   ~MKLDNNSoftmaxActivation() {}
   virtual const std::string& getName() const = 0;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 4851168aba..21822b10c2 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -71,7 +71,7 @@ typedef std::shared_ptr<BufferBatch> BufferBatchPtr;
  * @brief Data for batch training a neural network
  */
 class DataBatch {
-public:
+ public:
   DataBatch() : size_(0) { data_.clear(); }
   /**
    * @brief Get batch size
@@ -181,7 +181,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief batch size
    */
@@ -194,7 +194,7 @@ protected:
 };
 
 class BufferBatch {
-public:
+ public:
   BufferBatch() {
     hlStream_ = HPPL_STREAM_DEFAULT;
     hlEvent_ = NULL;
@@ -235,7 +235,7 @@ public:
   void swap(BufferBatch* bufBatch);
   void clone(DataBatch* srcBatch, bool useGpu);
 
-protected:
+ protected:
   DataBatch* batchData_;
   hl_stream_t hlStream_;
   hl_event_t hlEvent_;
@@ -247,7 +247,7 @@ typedef std::shared_ptr<DataProvider> DataProviderPtr;
 typedef Queue<BufferBatch*> BufferBatchQueue;
 
 class DoubleBuffer {
-public:
+ public:
   DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
   virtual ~DoubleBuffer();
   void removeOneBatch(DataBatch* dataBatch);
@@ -267,7 +267,7 @@ public:
 
   void setPending(bool pending) { pending_ = pending; }
 
-protected:
+ protected:
   virtual void asyncLoadBatch();
   void insertOneBatch(DataBatch* batch);
 
@@ -290,7 +290,7 @@ protected:
  * one is for input, one is for label.
  */
 class DataProvider {
-public:
+ public:
   static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_;
   static DataProvider* create(const DataConfig& config,
                               const ModelConfig& modelConfig,
@@ -359,7 +359,7 @@ public:
    */
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0;
 
-protected:
+ protected:
   DataConfig config_;
   bool skipShuffle_;
   float usageRatio_;
@@ -382,7 +382,7 @@ protected:
  * necessary configurations such as stream_names
  */
 class DummyDataProvider : public DataProvider {
-public:
+ public:
   DummyDataProvider(const DataConfig& config, bool useGpu)
       : DataProvider(config, useGpu) {}
   virtual void shuffle() {}
@@ -399,7 +399,7 @@ public:
  * Data provider for one input and one integer label.
  */
 class SimpleDataProviderBase : public DataProvider {
-protected:
+ protected:
   /// sample feature dimension
   int64_t sampleDim_;
   /// the number of samples
@@ -425,7 +425,7 @@ protected:
 
   RWLock lock_;
 
-public:
+ public:
   SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo);
   ~SimpleDataProviderBase() {}
 
@@ -440,7 +440,7 @@ public:
   /// return the number of samples in the buffer
   int64_t fillBuffer();
 
-protected:
+ protected:
   /**
    * @brief Fill at most size samples into data and label.
    *
@@ -458,12 +458,12 @@ protected:
 };
 
 class SimpleDataProvider : public SimpleDataProviderBase {
-public:
+ public:
   SimpleDataProvider(const DataConfig& config, bool useGpu);
   ~SimpleDataProvider();
   virtual void reset();
 
-protected:
+ protected:
   void loadData(const std::string& fileName);
   void loadDataFile(const std::string& fileName);
   virtual int64_t fillBufferImp(real* data,
@@ -471,7 +471,7 @@ protected:
                                 int* info,
                                 int64_t size);
 
-protected:
+ protected:
   size_t currentSampleIndex_;
   std::vector<int> labels_;
   std::vector<real> data_;
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 768e54fe82..91c94dc986 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -20,7 +20,7 @@ namespace paddle {
 
 template <class T>
 class DataProviderGroup : public DataProvider {
-protected:
+ protected:
   typedef T ProviderType;
   typedef std::shared_ptr<ProviderType> ProviderPtrType;
   ProviderPtrType provider_;
@@ -29,7 +29,7 @@ protected:
   std::mutex lock_;
   std::unique_ptr<MultiThreadWorker<ProviderType>> loader_;
 
-public:
+ public:
   DataProviderGroup(const DataConfig& config, bool useGpu);
   ~DataProviderGroup() {}
 
@@ -38,7 +38,7 @@ public:
   virtual int64_t getSize() { return -1; }
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
 
-private:
+ private:
   void startLoader();
   void stopLoader();
   void forceStopLoader();
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index 9a863c8967..baa1fc0190 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -19,10 +19,10 @@ limitations under the License. */
 namespace paddle {
 
 class MultiDataProvider : public DataProvider {
-protected:
+ protected:
   std::vector<std::unique_ptr<DataProvider>> subDataProviders_;
 
-public:
+ public:
   MultiDataProvider(const DataConfig& config,
                     const ModelConfig& modelConfig,
                     bool useGpu);
@@ -33,7 +33,7 @@ public:
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
   bool isTestMode() const { return isTestMode_; }
 
-private:
+ private:
   int totalDataRatio_;
   bool isTestMode_;
 };
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 786703f4de..08d045226e 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -28,7 +28,7 @@ namespace paddle {
  * messages from/to i/ostream.
  */
 class ProtoReader {
-public:
+ public:
   explicit ProtoReader(std::istream* s, bool dataCompression = false) {
     CHECK(s) << "istream pointer is nullptr";
     istreamInput_.reset(new google::protobuf::io::IstreamInputStream(s));
@@ -109,7 +109,7 @@ public:
     return true;
   }
 
-protected:
+ protected:
   std::unique_ptr<google::protobuf::io::ZeroCopyInputStream> istreamInput_;
   std::unique_ptr<google::protobuf::io::GzipInputStream> gzipInput_;
   std::unique_ptr<google::protobuf::io::CodedInputStream> codedInput_;
@@ -144,7 +144,7 @@ protected:
 };
 
 class ProtoWriter {
-public:
+ public:
   explicit ProtoWriter(std::ostream* s, bool dataCompression = false) {
     CHECK(s) << "ostream pointer is nullptr";
     ostreamOutput_.reset(new google::protobuf::io::OstreamOutputStream(s));
@@ -168,7 +168,7 @@ public:
     return ret;
   }
 
-protected:
+ protected:
   std::unique_ptr<google::protobuf::io::ZeroCopyOutputStream> ostreamOutput_;
   std::unique_ptr<google::protobuf::io::GzipOutputStream> gzipOutput_;
   std::unique_ptr<google::protobuf::io::CodedOutputStream> codedOutput_;
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index e53354c9e4..da50dd4e2e 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 
 class PyDataProvider : public DataProvider {
-public:
+ public:
   PyDataProvider(const DataConfig& config,
                  bool useGpu,
                  bool loadDataAll = true);
@@ -40,7 +40,7 @@ public:
 
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
 
-protected:
+ protected:
   struct ProtoSlot;
   // return false if each each sample is one sequence, i.e., independent
   // of other samples.
@@ -73,7 +73,7 @@ protected:
   void resetSlots();
   void loadData(const std::vector<std::string>& fileList);
 
-protected:
+ protected:
   struct ProtoSlot {
     SlotDef::SlotType type;
     int dim;
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index b4215bb307..54ee091e8f 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -93,7 +93,7 @@ inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
  * prepare step, fill data into argument during fill step.
  */
 class IFieldScanner {
-public:
+ public:
   DISABLE_COPY(IFieldScanner);
   /**
    * Ctor.
@@ -146,7 +146,7 @@ public:
    */
   static IFieldScanner* create(SlotHeader* header);
 
-protected:
+ protected:
   SlotHeader* headerPtr_;
 };
 
@@ -154,7 +154,7 @@ protected:
  * Py Data Provider Cache Interface.
  */
 class IPyDataProviderCache {
-public:
+ public:
   virtual ~IPyDataProviderCache() {}
 
   /**
@@ -193,7 +193,7 @@ public:
  * data. And it support cache strategies.
  */
 class PyDataProvider2 : public DataProvider {
-public:
+ public:
   /**
    * Ctor
    */
@@ -234,7 +234,7 @@ public:
    */
   virtual ~PyDataProvider2() { resetImpl(false); }
 
-private:
+ private:
   void createPyDataObj(const std::string& model,
                        const std::string& className,
                        const std::string& fileListName,
@@ -435,7 +435,7 @@ private:
     exit_ = false;
   }
 
-private:
+ private:
   std::unique_ptr<std::thread> loadThread_;
   std::atomic<bool> exit_;
   std::deque<PyObjectPtr> callingContexts_;
@@ -461,7 +461,7 @@ private:
   static PyObjectPtr zeroTuple_;
 
   class PositionRandom {
-  public:
+   public:
     inline explicit PositionRandom(bool skipRand)
         : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
 
@@ -476,14 +476,14 @@ private:
       }
     }
 
-  private:
+   private:
     std::default_random_engine& eng_;
     std::unique_ptr<std::uniform_int_distribution<size_t>> dist_;
     bool skipRand_;
   };
 
   // DataProvider interface
-public:
+ public:
   /**
    * Resetting the PyDataProvider. May start reading thread here.
    */
@@ -666,7 +666,7 @@ REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
  * Scanner for dense slot.
  */
 class DenseScanner : public IFieldScanner {
-public:
+ public:
   explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
 
   /**
@@ -708,7 +708,7 @@ public:
     ++height_;
   }
 
-private:
+ private:
   size_t height_;
 };
 
@@ -716,7 +716,7 @@ private:
  * Scanner for index slot
  */
 class IndexScanner : public IFieldScanner {
-public:
+ public:
   explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
 
   /**
@@ -740,12 +740,12 @@ public:
     CHECK(ok) << "Cannot cast int " << py::repr(obj);
   }
 
-private:
+ private:
   size_t cnt_;
 };
 
 class SparseNonValueScanner : public IFieldScanner {
-public:
+ public:
   explicit SparseNonValueScanner(SlotHeader* ptr)
       : IFieldScanner(ptr), nnz_(0), height_(0) {}
 
@@ -790,7 +790,7 @@ public:
     ++height_;
   }
 
-protected:
+ protected:
   /**
    * Set a single sparse index and value.
    * @param [out] col sparse index
@@ -809,7 +809,7 @@ protected:
 };
 
 class SparseValueScanner : public SparseNonValueScanner {
-public:
+ public:
   explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
 
   virtual void finishPrepare(Argument& argument) {
@@ -817,7 +817,7 @@ public:
         argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
   }
 
-protected:
+ protected:
   virtual void setData(int* col, real* dat, PyObject* obj) {
     py::SequenceHelper s(obj);
     SparseNonValueScanner::setData(col, dat, s[0]);
@@ -829,7 +829,7 @@ protected:
  * Sequence Scanner. Scanner for sequence or sub-sequence.
  */
 class SequenceScanner : public IFieldScanner {
-public:
+ public:
   /**
    * Ctor
    * @param innerScanner inner scanner for each timestep or sub-sequence.
@@ -902,7 +902,7 @@ public:
    */
   virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
 
-protected:
+ protected:
   size_t getSize(PyObject* obj) {
     py::SequenceHelper s(obj);
     auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
@@ -917,7 +917,7 @@ protected:
     }
   }
 
-private:
+ private:
   std::unique_ptr<IFieldScanner> inner_;
   size_t cnt_;
   std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
@@ -969,7 +969,7 @@ IFieldScanner* IFieldScanner::create(SlotHeader* header) {
  * python every pass.
  */
 class NoCacheStrategy : public IPyDataProviderCache {
-public:
+ public:
   virtual bool reset() { return true; }
 
   virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
@@ -984,7 +984,7 @@ public:
  * The rest passes, will load data from memory.
  */
 class CacheOnePassInMemory : public IPyDataProviderCache {
-public:
+ public:
   CacheOnePassInMemory()
       : objPool_(new std::deque<PyObjectPtr>()),
         droppedPool_(new std::deque<PyObjectPtr>()) {}
@@ -1011,7 +1011,7 @@ public:
 
   virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
 
-private:
+ private:
   std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
   std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
 };
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 0f680de776..c6cd41de9a 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -22,7 +22,7 @@ namespace paddle {
  * calculate sequence-to-sequence edit distance
  */
 class CTCErrorEvaluator : public Evaluator {
-private:
+ private:
   MatrixPtr outActivations_;
   int numTimes_, numClasses_, numSequences_, blank_;
   real deletions_, insertions_, substitutions_;
@@ -197,7 +197,7 @@ private:
         (real)seqClassficationError_ / numSequences_;
   }
 
-public:
+ public:
   CTCErrorEvaluator()
       : numTimes_(0),
         numClasses_(0),
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 755b91d05c..a2216293b1 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -77,7 +77,7 @@ class ChunkEvaluator : public Evaluator {
   std::set<int> excludedChunkTypes_;
   mutable std::unordered_map<std::string, real> values_;
 
-public:
+ public:
   virtual void init(const EvaluatorConfig& config) {
     Evaluator::init(config);
     if (config.chunk_scheme() == "IOB") {
@@ -276,7 +276,7 @@ public:
     return "chunk";
   }
 
-private:
+ private:
   void storeLocalValues() const {
     CHECK_GE(numOutputSegments_, 0);
     CHECK_GE(numLabelSegments_, 0);
diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
index f43ef5dd51..ddb8ebca78 100644
--- a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
+++ b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
@@ -28,7 +28,7 @@ namespace paddle {
  * The config file api is detection_map_evaluator.
  */
 class DetectionMAPEvaluator : public Evaluator {
-public:
+ public:
   DetectionMAPEvaluator()
       : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
 
@@ -132,7 +132,7 @@ public:
     LOG(FATAL) << "Distribute detection evaluation not implemented.";
   }
 
-protected:
+ protected:
   void calcTFPos(const size_t batchSize,
                  const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
                  const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
@@ -287,7 +287,7 @@ protected:
 
   real getValueImpl() const { return calcMAP(); }
 
-private:
+ private:
   real overlapThreshold_;  // overlap threshold when determining whether matched
   bool evaluateDifficult_;  // whether evaluate difficult ground truth
   size_t backgroundId_;     // class index of background
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index 79478e7fac..941fb8fb53 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -38,7 +38,7 @@ void Evaluator::eval(const NeuralNetwork& nn) {
  * The config file api is classification_error_evaluator.
  */
 class ClassificationErrorEvaluator : public Evaluator {
-public:
+ public:
   /*
   ClassificationErrorEvaluator() : totalScore2_(0) {}
 
@@ -124,7 +124,7 @@ public:
   }
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const { return "classification_error"; }
 };
 
@@ -135,7 +135,7 @@ protected:
  */
 class SequenceClassificationErrorEvaluator
     : public ClassificationErrorEvaluator {
-public:
+ public:
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
     numSamples_ += arguments[0].getNumSequences();
   }
@@ -166,7 +166,7 @@ public:
   }
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const { return "seq_classification_error"; }
 };
 REGISTER_EVALUATOR(seq_classification_error,
@@ -178,7 +178,7 @@ REGISTER_EVALUATOR(seq_classification_error,
  * The config file api is sum_evaluator.
  */
 class SumEvaluator : public Evaluator {
-public:
+ public:
   SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {}
 
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
@@ -255,12 +255,12 @@ public:
     mergeResultsOfAllClients(client);
   }
 
-private:
+ private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const { return "sum"; }
 };
 /**
@@ -274,7 +274,7 @@ protected:
  *
  */
 class ColumnSumEvaluator : public Evaluator {
-public:
+ public:
   explicit ColumnSumEvaluator(int32_t colIdx)
       : colIdx_(colIdx), colNum_(0), sum_(nullptr) {}
 
@@ -368,13 +368,13 @@ public:
     client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
   }
 
-private:
+ private:
   int32_t colIdx_;
   size_t colNum_;
   MatrixPtr sum_; /* cpu matrix */
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const {
     if (colIdx_ == -1)
       return "last-column-sum";
@@ -1018,7 +1018,7 @@ static InitFunction __reg_type_auc_sum__([]() {
  * The config file api is value_printer_evaluator.
  */
 class ValuePrinter : public NotGetableEvaluator {
-public:
+ public:
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
@@ -1038,7 +1038,7 @@ REGISTER_EVALUATOR(value_printer, ValuePrinter);
  * The config file api is gradient_printer_evaluator.
  */
 class GradientPrinter : public NotGetableEvaluator {
-public:
+ public:
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       const Argument& argu = nn.getLayer(name)->getOutput();
@@ -1061,11 +1061,11 @@ REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
  * The config file api is maxid_printer_evaluator.
  */
 class MaxIdPrinter : public NotGetableEvaluator {
-private:
+ private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
 
-public:
+ public:
   MaxIdPrinter() {}
 
   virtual void eval(const NeuralNetwork& nn) {
@@ -1103,12 +1103,12 @@ REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
  * The config file api is maxframe_printer_evaluator.
  */
 class MaxFramePrinter : public NotGetableEvaluator {
-private:
+ private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
   MatrixPtr value_;
 
-public:
+ public:
   MaxFramePrinter() {
     value_ =
         Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
@@ -1190,7 +1190,7 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
  *
  */
 class SequenceTextPrinter : public NotGetableEvaluator {
-private:
+ private:
   /// dict_file, which contains a list of tokens
   std::vector<std::string> dict_;
   /// result_file, which is the output file
@@ -1203,7 +1203,7 @@ private:
   /// store the probability associated with each sequence
   std::vector<MatrixPtr> cpuIn_;
 
-public:
+ public:
   SequenceTextPrinter() {}
 
   virtual void init(const EvaluatorConfig& config) {
@@ -1334,7 +1334,7 @@ REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
  * The config file api is classification_error_printer_evaluator.
  */
 class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
-public:
+ public:
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
 
   virtual real evalImp(std::vector<Argument>& arguments) {
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index be2032992c..42948f1097 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -40,7 +40,7 @@ class NeuralNetwork;
  * has been by a trained model.
  */
 class Evaluator {
-public:
+ public:
   static Evaluator* create(const EvaluatorConfig& config);
 
   Evaluator() : numSamples_(0), totalScore_(0) {}
@@ -172,7 +172,7 @@ public:
     return this->getTypeImpl();
   }
 
-protected:
+ protected:
   /**
    * @brief getValueImpl The simplest way to define getValue result. If this
    * evaluator doesn't contain multiple fields, and do not throw any error, just
@@ -191,7 +191,7 @@ protected:
    */
   virtual std::string getTypeImpl() const { return "base"; }
 
-protected:
+ protected:
   EvaluatorConfig config_;
   double numSamples_;
   double totalScore_;
@@ -204,7 +204,7 @@ protected:
  */
 class NotGetableEvaluator : public Evaluator {
   // Evaluator interface
-public:
+ public:
   void getNames(std::vector<std::string>* names) {}
 
   real getValue(const std::string& name, Error* err) const {
@@ -219,7 +219,7 @@ public:
 };
 
 class DummyEvaluator : public Evaluator {
-public:
+ public:
   DummyEvaluator() {}
   virtual void init(const EvaluatorConfig&) {}
   virtual void start() {}
@@ -232,7 +232,7 @@ public:
   virtual void printStats(std::ostream&) const {}
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const;
 };
 /**
@@ -251,7 +251,7 @@ protected:
  *
  */
 class AucEvaluator : public Evaluator {
-public:
+ public:
   AucEvaluator(int32_t colIdx)
       : colIdx_(colIdx),
         realColumnIdx_(0),
@@ -269,7 +269,7 @@ public:
 
   virtual void distributeEval(ParameterClient2* client);
 
-private:
+ private:
   static const uint32_t kBinNum_ = (1 << 24) - 1;
   static const int kNegativeLabel_ = 0;
   double statPos_[kBinNum_ + 1];
@@ -292,7 +292,7 @@ private:
   double calcAuc() const;
 
   // Evaluator interface
-protected:
+ protected:
   real getValueImpl() const;
   std::string getTypeImpl() const;
 };
@@ -305,7 +305,7 @@ protected:
  * dense value.
  */
 class RankAucEvaluator : public Evaluator {
-public:
+ public:
   // evaluate ranking AUC
   virtual void start();
 
@@ -317,7 +317,7 @@ public:
     mergeResultsOfAllClients(client);
   }
 
-private:
+ private:
   MatrixPtr output_;
   MatrixPtr click_;
   MatrixPtr pv_;
@@ -329,7 +329,7 @@ private:
                      size_t size);
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const;
 };
 
@@ -344,7 +344,7 @@ protected:
  * The config file api is precision_recall_evaluator.
  */
 class PrecisionRecallEvaluator : public Evaluator {
-public:
+ public:
   // Evaluate precision, recall and F1 score
   PrecisionRecallEvaluator()
       : isMultiBinaryLabel_(false),
@@ -379,7 +379,7 @@ public:
     StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
   };
 
-private:
+ private:
   bool isMultiBinaryLabel_;
   std::vector<StatsInfo> statsInfo_;
 
@@ -444,7 +444,7 @@ private:
  * The config file api is pnpair_evaluator.
  */
 class PnpairEvaluator : public Evaluator {
-public:
+ public:
   PnpairEvaluator()
       : cpuOutput_(nullptr),
         cpuLabel_(nullptr),
@@ -491,7 +491,7 @@ public:
               << " calc total neg pair: " << pairArray_[1];
   }
 
-private:
+ private:
   static const uint32_t kPairArrayNum_ = 2;
   double pairArray_[kPairArrayNum_];
   MatrixPtr cpuOutput_;
@@ -500,7 +500,7 @@ private:
   MatrixPtr cpuWeight_;
 
   // Evaluator interface
-protected:
+ protected:
   real getValueImpl() const {
     return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
   }
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 60936c311d..22cf5d265f 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -73,7 +73,7 @@ class GradientMachine;
 typedef std::shared_ptr<GradientMachine> GradientMachinePtr;
 
 class GradientMachine {
-public:
+ public:
   enum CreateMode {
     kNormal = 0,
     kSgdSparseCpuTraining = 3,
@@ -240,7 +240,7 @@ public:
    */
   virtual void releaseOutput() {}
 
-protected:
+ protected:
   virtual void onLoadParameter() {}
 
   std::vector<ParameterPtr> parameters_;
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/gserver/gradientmachines/GradientMachineMode.h
index 898b68fbbc..dd944a35f8 100644
--- a/paddle/gserver/gradientmachines/GradientMachineMode.h
+++ b/paddle/gserver/gradientmachines/GradientMachineMode.h
@@ -19,14 +19,14 @@ limitations under the License. */
 namespace paddle {
 
 class IGradientMachineMode {
-public:
+ public:
   virtual ~IGradientMachineMode() {}
 
-public:  // interfaces
-         /**
-          * @brief create current mode's gradient machine by model config.
-          * @param config model config
-          */
+ public:  // interfaces
+          /**
+           * @brief create current mode's gradient machine by model config.
+           * @param config model config
+           */
   virtual GradientMachine* create(const ModelConfig& config) = 0;
 
   /**
@@ -55,14 +55,14 @@ public:  // interfaces
    */
   virtual bool needTrainWholeDataInOneBatch() const = 0;
 
-public:  // static methods.
-         /**
-          * @brief register a custom gradient machine mode.
-          * @note For user to register a custom gradient machine mode, id should >=
-          * kCustom.
-          * @param mode mode id.
-          * @param ptr mode description object.
-          */
+ public:  // static methods.
+          /**
+           * @brief register a custom gradient machine mode.
+           * @note For user to register a custom gradient machine mode, id should >=
+           * kCustom.
+           * @param mode mode id.
+           * @param ptr mode description object.
+           */
   static void regGradientMachineMode(
       int32_t mode, std::unique_ptr<IGradientMachineMode>&& ptr) {
     modes_.insert(std::make_pair(mode, std::move(ptr)));
@@ -141,7 +141,7 @@ public:  // static methods.
     }
   }
 
-private:
+ private:
   static std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
       modes_;
 };
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 83d2651f34..eff7d5284c 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -166,7 +166,7 @@ struct GradBuffer {
  *     the merged gradient to parameter server.
  */
 class MultiGradientMachine : public GradientMachine {
-public:
+ public:
   enum TaskType {
     TASK_FORWARD_BACKWARD = 0,
     TASK_FORWARD = 1,
@@ -213,7 +213,7 @@ public:
   /// The gradietns will be copied to each thread in the computing threads.
   virtual void setOutputGrad(const std::vector<Argument>& args);
 
-protected:
+ protected:
   friend class TrainerThread;
 
   std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
@@ -281,7 +281,7 @@ protected:
 
   int paraMainThread(int pid) const { return paraMainThread_[pid]; }
 
-protected:
+ protected:
   virtual void forwardImp(const std::vector<Argument>& inArgs,
                           std::vector<Argument>* outArgs,
                           PassType passType,
@@ -298,7 +298,7 @@ protected:
 
   void allocGradBufs();
 
-protected:
+ protected:
   bool useGpu_;
 
   bool hasNonstaticCpuParamters_;
@@ -342,7 +342,7 @@ protected:
 };
 
 class TrainerThread {
-public:
+ public:
   TrainerThread(const ModelConfig& config,
                 int threadId,
                 MultiGradientMachine* multiMachine);
@@ -392,7 +392,7 @@ public:
   /// Whether the thread has input data.
   bool hasInputData() { return batchSize_ != 0; }
 
-protected:
+ protected:
   void mergeCpuGradients();
 
   void mergeGradSparse(
@@ -421,7 +421,7 @@ protected:
   /// GradientMachine::backward
   void doCallback(int pid);
 
-protected:
+ protected:
   MultiGradientMachine* multiMachine_;
   ModelConfig config_;
   /// whether the thread should stop
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp
index a1140402b8..5f3d09dda2 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp
@@ -122,7 +122,7 @@ void MultiNetwork::finish() {
 }
 
 class MultiCombinedEvaluator : public Evaluator {
-public:
+ public:
   MultiCombinedEvaluator() {}
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
@@ -167,7 +167,7 @@ public:
     }
   }
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
 };
 
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h
index 186a9ad0a3..495d559201 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.h
+++ b/paddle/gserver/gradientmachines/MultiNetwork.h
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 
 class MultiNetwork : public NeuralNetwork {
-public:
+ public:
   explicit MultiNetwork(std::string subModelName = "")
       : NeuralNetwork(subModelName) {}
 
@@ -58,7 +58,7 @@ public:
 
   virtual void finish();
 
-protected:
+ protected:
   std::vector<std::unique_ptr<NeuralNetwork>> subNetworks_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index a3c13df3db..ac60a3a340 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -362,7 +362,7 @@ void NeuralNetwork::releaseOutput() {
 #ifndef PADDLE_MOBILE_INFERENCE
 
 class CombinedEvaluator : public Evaluator {
-public:
+ public:
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
@@ -400,11 +400,11 @@ public:
     }
   }
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
 
   // Evaluator interface
-public:
+ public:
   /**
    * @brief getNames will return all inside evaluators' names.
    * @param names [out]: return names.
@@ -435,7 +435,7 @@ public:
         });
   }
 
-private:
+ private:
   template <typename T>
   T getMethodHelper(const std::string& name,
                     Error* err,
@@ -454,7 +454,7 @@ private:
 };
 
 class SubnetEvaluator : public CombinedEvaluator {
-public:
+ public:
   SubnetEvaluator(const std::string& layerName,
                   std::unique_ptr<Evaluator>&& evaluator)
       : layerName_(layerName) {
@@ -473,7 +473,7 @@ public:
                     << " in submodel " << nn.getName();
   }
 
-protected:
+ protected:
   std::string layerName_;
 };
 
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 5b32f844f7..3e5615c8f0 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -56,7 +56,7 @@ void parameterInitNN(int paramId,
                      std::vector<ParameterPtr>* sharedParams);
 
 class NeuralNetwork : public GradientMachine {
-public:
+ public:
   virtual void init(const ModelConfig& config,
                     ParamInitCallback callback = nullptr,
                     const std::vector<ParameterType>& parameterTypes =
@@ -144,7 +144,7 @@ public:
    */
   void releaseOutput();
 
-protected:
+ protected:
   /**
    * The constructor of NeuralNetwork.
    * The sub networks can get parameters_ and parameterMap_
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
index e3b6812123..c091459506 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -32,7 +32,7 @@ enum TaskType {
  * multiple threads in parallel.
  */
 class ParallelNeuralNetwork : public NeuralNetwork {
-public:
+ public:
   ParallelNeuralNetwork(std::string subModelName = "",
                         NeuralNetwork *rootNetwork = nullptr)
       : NeuralNetwork(subModelName, rootNetwork) {}
@@ -66,7 +66,7 @@ public:
 
   // virtual void eval(Evaluator* evaluator);
 
-protected:
+ protected:
   bool useGpu_;
   /// number of gpu devices
   int numDevices_;
@@ -74,7 +74,7 @@ protected:
 };
 
 class ParallelThread {
-public:
+ public:
   ParallelThread(int threadId, int deviceId, bool useGpu);
   ~ParallelThread();
   void jobEnqueue(LayerPtr layer, TaskType task);
@@ -87,10 +87,10 @@ public:
   }
   void setForwardPassType(PassType passType) { passType_ = passType; }
 
-protected:
+ protected:
   void computeThread();
 
-public:
+ public:
   struct Job {
     LayerPtr layer_;
     TaskType task_;
@@ -98,7 +98,7 @@ public:
   typedef Queue<Job> JobQueue;
   JobQueue queue_;
 
-protected:
+ protected:
   /// from 0 to threads-1
   int threadId_;
   /// the GPU device Id which the computeThread_ used
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 2429b5d1a0..73ac8cda72 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -96,7 +96,7 @@ static InitFunction __init__diy_prob_method(
     std::numeric_limits<int>::max());
 
 class BeamSearchControlCallbacks {
-public:
+ public:
   RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback
       beamSearchCandidateAdjust;
   RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode;
@@ -115,7 +115,7 @@ public:
 };
 
 class BeamSearchStatisticsCallbacks {
-public:
+ public:
   RecurrentGradientMachine::EachStepCallback onEachStepStarted;
   RecurrentGradientMachine::EachStepCallback onEachStepStoped;
 
@@ -148,11 +148,11 @@ RecurrentGradientMachine::RecurrentGradientMachine(
  *    so it's should not be placed in root network.
  */
 class BootBiasLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
   IVectorPtr cpuIds_;
 
-public:
+ public:
   explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index 0032b72cda..7e943cebd3 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -30,7 +30,7 @@ class BeamSearchControlCallbacks;
 class BeamSearchStatisticsCallbacks;
 
 class RecurrentGradientMachine : public NeuralNetwork {
-public:
+ public:
   RecurrentGradientMachine(const std::string& subModelName,
                            NeuralNetwork* rootNetwork);
 
@@ -290,7 +290,7 @@ public:
     return this->finalPaths_;
   }
 
-protected:
+ protected:
   std::vector<Argument::SeqInfo> commonSeqInfo_;
   ICpuGpuVectorPtr sequenceStartPositions_;
   void calcSequenceStartPositions();
@@ -447,7 +447,7 @@ protected:
   MatrixPtr cpuProb_;
   IVectorPtr cpuEos_;
 
-private:
+ private:
   /*
    * @return beam size in beam search
    */
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
index 1d00063056..6ea54f4a53 100644
--- a/paddle/gserver/layers/AddtoLayer.h
+++ b/paddle/gserver/layers/AddtoLayer.h
@@ -33,10 +33,10 @@ namespace paddle {
  * The config file api is addto_layer.
  */
 class AddtoLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
 
   ~AddtoLayer() {}
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index da0ac45308..51f346d5c9 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -26,11 +26,11 @@ namespace paddle {
  * called to set one and only one real layer
  */
 class AgentLayer : public Layer {
-protected:
+ protected:
   LayerPtr realLayer_;
   int numSamples_;
 
-public:
+ public:
   explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
 
   ~AgentLayer() {}
@@ -55,14 +55,14 @@ public:
  * GatherAgentLayer collect a complete sequence.
  */
 class GatherAgentLayer : public Layer {
-protected:
+ protected:
   std::vector<LayerPtr> realLayers_;
   std::vector<IVectorPtr> idsVec_;
   // we don't clear idsVec_ vector to aviod IVector alloc/free
   IVectorPtr allIds_;
   std::vector<int> idIndex_;
 
-public:
+ public:
   explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual ~GatherAgentLayer() {}
@@ -95,7 +95,7 @@ public:
  * if it is, the agent will select a few ids in real layer.
  */
 class ScatterAgentLayer : public Layer {
-protected:
+ protected:
   LayerPtr realLayer_;
   IVectorPtr ids_;
   IVectorPtr cpuIds_;
@@ -113,7 +113,7 @@ protected:
   // true for setRealLayer, false for setRealLayerAndOutput
   bool selectionMode_;
 
-public:
+ public:
   explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual ~ScatterAgentLayer() {}
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index 24602d2a9c..03e2673b55 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -37,7 +37,7 @@ namespace paddle {
  * The config file api is pooling_layer.
  */
 class AverageLayer : public SequencePoolLayer {
-public:
+ public:
   enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
   explicit AverageLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
@@ -48,7 +48,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   int mode_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index 69d642af4f..5a446c0843 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -40,7 +40,7 @@ namespace paddle {
  */
 
 class BatchNormBaseLayer : public Layer {
-public:
+ public:
   explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {}
 
   ~BatchNormBaseLayer() {}
@@ -61,7 +61,7 @@ public:
    */
   void calFeatureMapSize();
 
-protected:
+ protected:
   /// Batch normalization scale parameter, which is referred to as gamma in
   /// in original paper.
   std::unique_ptr<Weight> weight_;
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 95add69215..e5e4e690b6 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -27,7 +27,7 @@ namespace paddle {
  */
 
 class BatchNormalizationLayer : public BatchNormBaseLayer {
-public:
+ public:
   explicit BatchNormalizationLayer(const LayerConfig& config)
       : BatchNormBaseLayer(config), firstTest_(true) {}
 
@@ -38,7 +38,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   /// Load pre-calculated mean and std.
   void setMeanAndStd();
 
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
index acd320420f..8e08c2e1ce 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ b/paddle/gserver/layers/BilinearInterpLayer.h
@@ -26,13 +26,13 @@ namespace paddle {
  * @note  The config file api is bilinear_interp_layer.
  */
 class BilinearInterpLayer : public Layer {
-protected:
+ protected:
   size_t outImgH_, outImgW_;
   size_t inImgH_, inImgW_;
   real ratioH_, ratioW_;
   size_t numChannels_;
 
-public:
+ public:
   explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual ~BilinearInterpLayer() {}
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 1797b64036..9d76584f3a 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -40,7 +40,7 @@ namespace paddle {
  * The config file api is block_expand_layer.
  */
 class BlockExpandLayer : public Layer {
-protected:
+ protected:
   /**
    * @brief Calculate outputH_ and outputW_ and return block number which
    * actually is time steps.
@@ -53,7 +53,7 @@ protected:
   TensorShape inputShape_;
   TensorShape outputShape_;
 
-public:
+ public:
   explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
 
   ~BlockExpandLayer() {}
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
index fba3cebac1..018162e146 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.h
+++ b/paddle/gserver/layers/CRFDecodingLayer.h
@@ -30,14 +30,14 @@ namespace paddle {
  * See LinearChainCRF.h for the detail of the CRF formulation.
  */
 class CRFDecodingLayer : public CRFLayer {
-public:
+ public:
   explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   std::unique_ptr<LinearChainCRF> crf_;
 };
 
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index cb5bd05568..88c2ed343a 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -27,14 +27,14 @@ namespace paddle {
  * See class LinearChainCRF for the detail of the CRF formulation.
  */
 class CRFLayer : public Layer {
-public:
+ public:
   explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   size_t numClasses_;
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h
index fcbc42565e..5d70b1f4ce 100644
--- a/paddle/gserver/layers/CTCLayer.h
+++ b/paddle/gserver/layers/CTCLayer.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 
 class CTCLayer : public Layer {
-public:
+ public:
   explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
@@ -31,7 +31,7 @@ public:
                    const Argument& softmaxSeqs,
                    const Argument& labelSeqs);
 
-protected:
+ protected:
   size_t numClasses_;
   bool normByTimes_;
   std::vector<LinearChainCTC> ctcs_;
diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp
index dbc3337499..6aa3c8fe64 100644
--- a/paddle/gserver/layers/ClipLayer.cpp
+++ b/paddle/gserver/layers/ClipLayer.cpp
@@ -24,11 +24,11 @@ namespace paddle {
  */
 
 class ClipLayer : public Layer {
-protected:
+ protected:
   double min_;
   double max_;
 
-public:
+ public:
   explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
index f5ab29a509..e6de329ff3 100644
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -23,7 +23,7 @@ namespace paddle {
  * each input as one row for the output of this layer and apply activation.
  */
 class ConcatenateLayer : public Layer {
-public:
+ public:
   explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ConcatenateLayer() {}
@@ -97,7 +97,7 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
  * processed by a Projection.
  */
 class ConcatenateLayer2 : public Layer {
-public:
+ public:
   explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
 
   ~ConcatenateLayer2() {}
@@ -108,7 +108,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Projection>> projections_;
   std::vector<Argument> projOutput_;
   std::vector<std::pair<size_t, size_t>> projCol_;
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
index e30f98f58d..9c21714541 100644
--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -42,7 +42,7 @@ namespace paddle {
  * The config file api is context_projection.
  */
 class ContextProjection : public Projection {
-public:
+ public:
   /**
    * Constructor. If context_start is zero and context_lenth is one, it will
    * set trainable_padding false. trainable_padding is an optional arguments
@@ -63,7 +63,7 @@ public:
 
   virtual bool init();
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
   /// number of extra timesteps added at the beginning
   size_t beginPad_;
diff --git a/paddle/gserver/layers/Conv3DLayer.h b/paddle/gserver/layers/Conv3DLayer.h
index 5ab5ff3d4a..07b804bad0 100644
--- a/paddle/gserver/layers/Conv3DLayer.h
+++ b/paddle/gserver/layers/Conv3DLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * calculate convolution operation.
  */
 class Conv3DLayer : public ConvBaseLayer {
-public:
+ public:
   explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
   ~Conv3DLayer() {}
 
@@ -40,7 +40,7 @@ public:
   void bpropWeights(int i);
   size_t getSize();
 
-protected:
+ protected:
   // Figure out the dimensions for individual gemms.
   IntV M_;  /// numFilters_ / filter_group_;
   IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index 93869fe68d..801bc4f888 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -24,7 +24,7 @@ namespace paddle {
  */
 
 class ConvBaseLayer : public Layer {
-protected:
+ protected:
   typedef std::vector<int> IntV;
 
   /// True if it's deconv layer, false if it's convolution layer
@@ -88,7 +88,7 @@ protected:
   /// of output size.
   bool caffeMode_;
 
-public:
+ public:
   explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h
index 27fb0362d3..c3c647cb69 100644
--- a/paddle/gserver/layers/ConvBaseOperator.h
+++ b/paddle/gserver/layers/ConvBaseOperator.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ConvBaseOperator : public Operator {
-public:
+ public:
   ConvBaseOperator(const OperatorConfig &config, bool useGpu);
   /**
    * Free workspace in device and destroy cudnn tensor descriptor.
@@ -46,7 +46,7 @@ public:
     hl_destroy_convolution_descriptor(convDesc_);
   }
 
-protected:
+ protected:
   /**
    * Get convolution parameters from layer config and
    * initialize member variables.
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index ba76d236d9..f3266ae1ab 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -23,7 +23,7 @@ namespace paddle {
  * @brief Base class for ConvProjection and ConvTransProjection.
  */
 class ConvBaseProjection : public Projection {
-public:
+ public:
   /**
    * Constructor.
    */
@@ -33,7 +33,7 @@ public:
 
   ~ConvBaseProjection();
 
-protected:
+ protected:
   void getConvParams();
   void initCudnn();
 
diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h
index fbdb7bb1cd..527dbf8c27 100644
--- a/paddle/gserver/layers/ConvOperator.h
+++ b/paddle/gserver/layers/ConvOperator.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ConvOperator : public ConvBaseOperator {
-public:
+ public:
   ConvOperator(const OperatorConfig &config, bool useGpu)
       : ConvBaseOperator(config, useGpu) {}
   /**
diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h
index e8ecb99431..22a2202bb6 100644
--- a/paddle/gserver/layers/ConvProjection.h
+++ b/paddle/gserver/layers/ConvProjection.h
@@ -23,7 +23,7 @@ namespace paddle {
  * @brief Convolution projection do the same calculation with CudnnConvLayer.
  */
 class ConvProjection : public ConvBaseProjection {
-public:
+ public:
   /**
    * Constructor.
    */
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index fb87771019..615c347806 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -42,7 +42,7 @@ namespace paddle {
  */
 
 class ConvShiftLayer : public Layer {
-public:
+ public:
   explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ConvShiftLayer() {}
diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h
index 1bf58f2bfb..53cb7a21b4 100644
--- a/paddle/gserver/layers/ConvTransOperator.h
+++ b/paddle/gserver/layers/ConvTransOperator.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ConvTransOperator : public ConvBaseOperator {
-public:
+ public:
   ConvTransOperator(const OperatorConfig &config, bool useGpu)
       : ConvBaseOperator(config, useGpu) {}
   /**
diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h
index 269b2694c8..0f9ed720d3 100644
--- a/paddle/gserver/layers/ConvTransProjection.h
+++ b/paddle/gserver/layers/ConvTransProjection.h
@@ -23,7 +23,7 @@ namespace paddle {
  * @brief Convolution projection do the same calculation with CudnnConvLayer.
  */
 class ConvTransProjection : public ConvBaseProjection {
-public:
+ public:
   /**
    * Constructor.
    */
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index dce751940c..31363d97c4 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -36,7 +36,7 @@ namespace paddle {
  * The config file api is linear_comb_layer.
  */
 class ConvexCombinationLayer : public Layer {
-protected:
+ protected:
   /// A matrix pointer pointing to second input.
   MatrixPtr tmpMtx0;
   /// A matrix pointer pointing to first input.
@@ -44,7 +44,7 @@ protected:
   /// A matrix pointer pointing to output.
   MatrixPtr tmpRow1;
 
-public:
+ public:
   explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ConvexCombinationLayer() {}
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
index 675cdb16b5..d9fe1ff270 100644
--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  * The config file api is cos_sim.
  */
 class CosSimLayer : public Layer {
-public:
+ public:
   explicit CosSimLayer(const LayerConfig& config) : Layer(config) {}
 
   ~CosSimLayer() {}
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
index 685b4e8ef3..230ecc768b 100644
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -32,7 +32,7 @@ namespace paddle {
  */
 
 class CosSimVecMatLayer : public Layer {
-protected:
+ protected:
   MatrixPtr tmpMtx0;
   MatrixPtr tmpMtx1;
   MatrixPtr tmpRow0;
@@ -40,7 +40,7 @@ protected:
   MatrixPtr tmpRow2;
   MatrixPtr tmpRow3;
 
-public:
+ public:
   explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {}
 
   ~CosSimVecMatLayer() {}
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 484f803a83..1327616950 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -716,7 +716,7 @@ void HuberTwoClassification::backwardImp(Matrix& output,
  * \f]
  */
 class SumCostLayer : public Layer {
-public:
+ public:
   explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 306c067ed1..9bfec0e2b1 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  * handled by the base class.
  */
 class CostLayer : public Layer {
-public:
+ public:
   explicit CostLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -51,7 +51,7 @@ public:
                            Argument& label,
                            Matrix& outputGrad) = 0;
 
-protected:
+ protected:
   LayerPtr weightLayer_;
   real coeff_;
 };
@@ -65,7 +65,7 @@ protected:
  * \f]
  */
 class MultiClassCrossEntropy : public CostLayer {
-public:
+ public:
   explicit MultiClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -95,7 +95,7 @@ public:
  *     In Proceedings of the ACL 2014 Conference.
  */
 class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
-public:
+ public:
   explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -108,7 +108,7 @@ public:
                    Argument& label,
                    Matrix& outputGrad) override;
 
-protected:
+ protected:
   MatrixPtr sftMaxSum_;
   MatrixPtr sumInv_;
 };
@@ -120,7 +120,7 @@ protected:
  * \f]
  */
 class SoftBinaryClassCrossEntropy : public CostLayer {
-public:
+ public:
   explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -133,7 +133,7 @@ public:
                    Argument& label,
                    Matrix& outputGrad) override;
 
-protected:
+ protected:
   MatrixPtr targetPerDim_;
 };
 
@@ -145,7 +145,7 @@ protected:
  * \f]
  */
 class SumOfSquaresCostLayer : public CostLayer {
-public:
+ public:
   explicit SumOfSquaresCostLayer(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -171,7 +171,7 @@ public:
  * x = output - label
  */
 class SmoothL1CostLayer : public CostLayer {
-public:
+ public:
   explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -197,7 +197,7 @@ public:
  *      Rank useing Gradient Descent.
  */
 class RankingCost : public Layer {
-public:
+ public:
   explicit RankingCost(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -225,7 +225,7 @@ public:
     (void)outputGrad;
   }
 
-private:
+ private:
   double posPairCount_;
   double negPairCount_;
   MatrixPtr margin_;
@@ -250,7 +250,7 @@ private:
  *     with Nonsmooth Cost Functions.
  */
 class LambdaCost : public Layer {
-public:
+ public:
   explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -270,7 +270,7 @@ public:
                 real* gradData,
                 int size);
 
-private:
+ private:
   MatrixPtr marginGrad_;
   int truncationSize_;
   int maxSortSize_;
@@ -287,10 +287,10 @@ private:
  * \f]
  */
 class MultiBinaryLabelCrossEntropy : public CostLayer {
-protected:
+ protected:
   MatrixPtr targetPerDim_;
 
-public:
+ public:
   explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -308,7 +308,7 @@ public:
  * A base layer for HuberRegressionLoss and HuberTwoClassification.
  */
 class HuberCost : public CostLayer {
-public:
+ public:
   std::vector<Argument> tmpCpuInput_;
 
   explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
@@ -331,7 +331,7 @@ public:
  * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
  */
 class HuberRegressionLoss : public HuberCost {
-public:
+ public:
   explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -343,7 +343,7 @@ public:
                    Argument& label,
                    Matrix& outputGrad) override;
 
-protected:
+ protected:
   real delta_;
 };
 
@@ -356,7 +356,7 @@ protected:
  * Loss = 0, otherwise
  */
 class HuberTwoClassification : public HuberCost {
-public:
+ public:
   explicit HuberTwoClassification(const LayerConfig& config)
       : HuberCost(config) {}
 
diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/gserver/layers/CropLayer.h
index 1a85911ef7..ef88bc483d 100644
--- a/paddle/gserver/layers/CropLayer.h
+++ b/paddle/gserver/layers/CropLayer.h
@@ -28,7 +28,7 @@ namespace paddle {
  *                  crop input as this shape conf
  */
 class CropLayer : public Layer {
-public:
+ public:
   explicit CropLayer(const LayerConfig& config) : Layer(config) {}
 
   ~CropLayer() {}
@@ -38,7 +38,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   void setOutDims();
   void setInDims();
 
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
index b47a2933c2..c8702b1616 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -44,7 +44,7 @@ struct BeamExpansion {
 typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
 
 class CostForOneSequence {
-public:
+ public:
   CostForOneSequence()
       : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
   void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
@@ -64,7 +64,7 @@ public:
   real forward();
   void backward();
 
-private:
+ private:
   void calValidExpandStep();
   void constructTotalExpansion();
   size_t initLastExpansion();
@@ -93,14 +93,14 @@ private:
 };
 
 class CrossEntropyOverBeam : public Layer {
-public:
+ public:
   explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-private:
+ private:
   void checkInputs();
   void copyInputsToCpu();
   void resizeOutput();
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index aa279f73d6..1bb4eff8d2 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -30,7 +30,7 @@ namespace paddle {
  */
 
 class CudnnBatchNormLayer : public BatchNormBaseLayer {
-public:
+ public:
   explicit CudnnBatchNormLayer(const LayerConfig& config)
       : BatchNormBaseLayer(config) {}
 
@@ -46,7 +46,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   /// Epsilon value used in the batch normalization formula.
   /// Same epsilon value should be used in forward and backward functions.
   double eps_;
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h
index 698104e4fb..1ee1aa100d 100644
--- a/paddle/gserver/layers/CudnnConvBaseLayer.h
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.h
@@ -31,14 +31,14 @@ namespace paddle {
  * The config file api is img_conv_layer.
  */
 class CudnnConvBaseLayer : public ConvBaseLayer {
-protected:
+ protected:
   std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
   std::vector<std::unique_ptr<Projection>> projections_;
 
   hl_tensor_descriptor biasDesc_;
   hl_tensor_descriptor outputDesc_;
 
-public:
+ public:
   explicit CudnnConvBaseLayer(const LayerConfig& config)
       : ConvBaseLayer(config) {}
 
diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h
index 9eb4fc6138..fc249354d1 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.h
+++ b/paddle/gserver/layers/CudnnPoolLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  */
 
 class CudnnPoolLayer : public PoolLayer {
-protected:
+ protected:
   int windowHeight, windowWidth;
   int heightPadding, widthPadding, strideHeight, strideWidth;
   int imageH_, imageW_, outputH_, outputW_;
@@ -40,7 +40,7 @@ protected:
   /// A description of a pooling operation.
   hl_pooling_descriptor poolingDesc_;
 
-public:
+ public:
   static bool typeCheck(const std::string& poolType,
                         hl_pooling_mode_t* mode = nullptr);
   explicit CudnnPoolLayer(const LayerConfig& config);
diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h
index 4b12afe0ef..d02f5a4697 100644
--- a/paddle/gserver/layers/DataLayer.h
+++ b/paddle/gserver/layers/DataLayer.h
@@ -25,7 +25,7 @@ namespace paddle {
  * The config file api is data_layer.
  */
 class DataLayer : public Layer {
-public:
+ public:
   explicit DataLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual void setData(const Argument& data) { data_ = data; }
@@ -58,10 +58,10 @@ public:
     }
   }
 
-private:
+ private:
   void copyDataToOutput(Argument& output);
 
-protected:
+ protected:
   Argument data_;
 };
 
diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h
index 2a2a2a4aa7..7ae67a877b 100644
--- a/paddle/gserver/layers/DataNormLayer.h
+++ b/paddle/gserver/layers/DataNormLayer.h
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class DataNormLayer : public Layer {
-public:
+ public:
   enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 };
 
   explicit DataNormLayer(const LayerConfig& config) : Layer(config) {}
@@ -50,7 +50,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   int mode_;
   std::unique_ptr<Weight> weight_;
   MatrixPtr min_;
diff --git a/paddle/gserver/layers/DeConv3DLayer.h b/paddle/gserver/layers/DeConv3DLayer.h
index 57d51cdec6..13d1d07cf5 100644
--- a/paddle/gserver/layers/DeConv3DLayer.h
+++ b/paddle/gserver/layers/DeConv3DLayer.h
@@ -27,7 +27,7 @@ namespace paddle {
  * calculate deconvolution3D operation.
  */
 class DeConv3DLayer : public ConvBaseLayer {
-public:
+ public:
   explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
   ~DeConv3DLayer() {}
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -40,7 +40,7 @@ public:
   void bpropWeights(int i);
   size_t getSize();
 
-protected:
+ protected:
   // Figure out the dimensions for individual gemms.
   IntV M_;  /// numFilters_ / filter_group_;
   IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
index 174a6e5d9a..b0270ed331 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.h
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  */
 
 class DetectionOutputLayer : public Layer {
-public:
+ public:
   explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -42,7 +42,7 @@ public:
 
   void backward(const UpdateCallback& callback = nullptr) {}
 
-protected:
+ protected:
   inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
 
   inline LayerPtr getLocInputLayer(size_t index) {
@@ -53,7 +53,7 @@ protected:
     return inputLayers_[1 + inputNum_ + index];
   }
 
-private:
+ private:
   size_t numClasses_;  // number of classes
   size_t inputNum_;    // number of input layers
   real nmsThreshold_;
diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/gserver/layers/DotMulOperator.cpp
index 68db2929ad..03d18d9b23 100644
--- a/paddle/gserver/layers/DotMulOperator.cpp
+++ b/paddle/gserver/layers/DotMulOperator.cpp
@@ -27,7 +27,7 @@ namespace paddle {
  * The config file api is dotmul_operator.
  */
 class DotMulOperator : public Operator {
-public:
+ public:
   DotMulOperator(const OperatorConfig& config, bool useGpu);
   virtual void forward();
   virtual void backward();
diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/gserver/layers/DotMulProjection.cpp
index 86453aae84..d778038767 100644
--- a/paddle/gserver/layers/DotMulProjection.cpp
+++ b/paddle/gserver/layers/DotMulProjection.cpp
@@ -26,14 +26,14 @@ namespace paddle {
  * The config file api is dotmul_projection.
  */
 class DotMulProjection : public Projection {
-public:
+ public:
   DotMulProjection(const ProjectionConfig& config,
                    const ParameterPtr& parameter,
                    bool useGpu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   /// shared memory with parameter
   std::unique_ptr<Weight> weight_;
 };
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
index 5148d93e27..72b0c707b2 100644
--- a/paddle/gserver/layers/DotProdLayer.cpp
+++ b/paddle/gserver/layers/DotProdLayer.cpp
@@ -27,7 +27,7 @@ namespace paddle {
  */
 
 class DotProdLayer : public Layer {
-public:
+ public:
   explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
 
   ~DotProdLayer() {}
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
index 470a5b8ea2..04400f2836 100644
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  * It is used by recurrent layer group.
  */
 class EosIdCheckLayer : public Layer {
-public:
+ public:
   explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index be968155ef..6919ef7135 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ExpandConvLayer : public ConvBaseLayer {
-public:
+ public:
   explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
 
   ~ExpandConvLayer() {}
@@ -42,7 +42,7 @@ public:
 
   size_t getOutputSize();
 
-protected:
+ protected:
   std::vector<TensorShape> inputShape_;
   std::vector<TensorShape> filterShape_;
   std::vector<TensorShape> outputShape_;
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 04bbfcbd04..06bd4ef05e 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class ExpandLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
   /// if input[0] is dense data, ExpandLevel=kNonSeq;
   /// if input[0] is sequence data, ExpandLevel=kSeq
@@ -48,7 +48,7 @@ protected:
   /// of input[1]
   ICpuGpuVectorPtr expandStartsPos_;
 
-public:
+ public:
   explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ExpandLayer() {}
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 684da4e65a..148abe2381 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -42,7 +42,7 @@ namespace paddle {
  */
 
 class FactorizationMachineLayer : public Layer {
-protected:
+ protected:
   // The latent vectors, shape: (size, factorSize_)
   // Each row of the latentVectors_ matrix is the latent vector
   // corresponding to one input feature dimension
@@ -50,7 +50,7 @@ protected:
   // The hyperparameter that defines the dimensionality of the factorization
   size_t factorSize_;
 
-private:
+ private:
   // Store the square values of the letent vectors matrix
   MatrixPtr latentVectorsSquare_;
   // Store the square values of input matrix
@@ -65,7 +65,7 @@ private:
   // Negative identity matrix
   MatrixPtr negOnes_;
 
-public:
+ public:
   explicit FactorizationMachineLayer(const LayerConfig& config)
       : Layer(config) {}
   ~FactorizationMachineLayer() {}
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index 81b98da45b..d95f0b9b3d 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -38,11 +38,11 @@ namespace paddle {
  */
 
 class FeatureMapExpandLayer : public Layer {
-private:
+ private:
   int numFilters_;
   bool asRowVector_;
 
-public:
+ public:
   explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
 
   ~FeatureMapExpandLayer() {}
diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/gserver/layers/FullMatrixProjection.h
index 7c4cd1a706..a27aa4a123 100644
--- a/paddle/gserver/layers/FullMatrixProjection.h
+++ b/paddle/gserver/layers/FullMatrixProjection.h
@@ -28,14 +28,14 @@ namespace paddle {
  * The config file api is full_matrix_projection.
  */
 class FullMatrixProjection : public Projection {
-public:
+ public:
   FullMatrixProjection(const ProjectionConfig& config,
                        const ParameterPtr& parameter,
                        bool useGpu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 };
 
diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h
index e66aeeb733..e0f9d6ce55 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.h
+++ b/paddle/gserver/layers/FullyConnectedLayer.h
@@ -28,11 +28,11 @@ namespace paddle {
  */
 
 class FullyConnectedLayer : public Layer {
-protected:
+ protected:
   WeightList weights_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
   ~FullyConnectedLayer() {}
 
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
index f0a3a82301..46508dc977 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -47,7 +47,7 @@ namespace paddle {
  */
 
 class GatedRecurrentLayer : public Layer, public GruCompute {
-public:
+ public:
   explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -63,7 +63,7 @@ public:
 
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   void forwardSequence(int batchSize,
                        size_t numSequences,
                        const int* starts,
@@ -79,7 +79,7 @@ protected:
                     MatrixPtr inputValue);
   void backwardBatch(int batchSize, MatrixPtr inputGrad);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> gateWeight_;
   std::unique_ptr<Weight> stateWeight_;
diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp
index f255681f3e..7c1e3c407c 100644
--- a/paddle/gserver/layers/GetOutputLayer.cpp
+++ b/paddle/gserver/layers/GetOutputLayer.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 
 class GetOutputLayer : public Layer {
-public:
+ public:
   explicit GetOutputLayer(const LayerConfig& config) : Layer(config) {}
 
   ~GetOutputLayer() {}
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index fb6bc56422..50006325ce 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class GruCompute {
-public:
+ public:
   void init(LayerConfig &config);
 
   template <bool useGpu>
@@ -33,7 +33,7 @@ public:
                 int frameSize,
                 int batchSize = 1);
 
-public:
+ public:
   hl_activation_mode_t activeNode_;
   hl_activation_mode_t activeGate_;
 };
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
index 917c50250c..114f287411 100644
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -44,13 +44,13 @@ namespace paddle {
  * The config file api if gru_step_layer.
  */
 class GruStepLayer : public Layer, public GruCompute {
-protected:
+ protected:
   Argument gate_;
   Argument resetOutput_;
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> bias_;
 
-public:
+ public:
   explicit GruStepLayer(const LayerConfig& config) : Layer(config) {}
 
   ~GruStepLayer() {}
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 10e501f180..73ef252fd5 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -58,7 +58,7 @@ namespace paddle {
  * The config file api is hsigmod_layer.
  */
 class HierarchicalSigmoidLayer : public Layer {
-public:
+ public:
   explicit HierarchicalSigmoidLayer(const LayerConfig& config)
       : Layer(config) {}
   bool init(const LayerMap& layerMap,
@@ -66,7 +66,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   /**
    * The last of inputs is label layer.
    */
diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp
index 6c70f77acc..34e9eb9016 100644
--- a/paddle/gserver/layers/IdentityProjection.cpp
+++ b/paddle/gserver/layers/IdentityProjection.cpp
@@ -26,7 +26,7 @@ namespace paddle {
  * The config file api is identity_projection.
  */
 class IdentityProjection : public Projection {
-public:
+ public:
   IdentityProjection(const ProjectionConfig& config,
                      const ParameterPtr& parameter,
                      bool useGpu);
@@ -68,7 +68,7 @@ void IdentityProjection::backward(const UpdateCallback& callback) {
  * The config file api is identity_projection.
  */
 class IdentityOffsetProjection : public Projection {
-public:
+ public:
   IdentityOffsetProjection(const ProjectionConfig& config,
                            const ParameterPtr& parameter,
                            bool useGpu);
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
index 0ac92024bc..509c07cf22 100644
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -33,12 +33,12 @@ namespace paddle {
  */
 
 class InterpolationLayer : public Layer {
-protected:
+ protected:
   /// weightLast = 1 - weight
   MatrixPtr weightLast_;
   MatrixPtr tmpMatrix;
 
-public:
+ public:
   explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {}
 
   ~InterpolationLayer() {}
diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
index 0ea960902e..7fd25954ef 100644
--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -17,14 +17,14 @@ limitations under the License. */
 namespace paddle {
 
 class KmaxSeqScoreLayer : public Layer {
-private:
+ private:
   MatrixPtr scores_;
   size_t beamSize_;
   void kmaxScorePerSeq(const real* score,
                        real* sortedRes,
                        const ICpuGpuVectorPtr seqStartPos);
 
-public:
+ public:
   explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/L2DistanceLayer.h b/paddle/gserver/layers/L2DistanceLayer.h
index 97f35daf78..44e688e137 100644
--- a/paddle/gserver/layers/L2DistanceLayer.h
+++ b/paddle/gserver/layers/L2DistanceLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  */
 
 class L2DistanceLayer : public Layer {
-public:
+ public:
   explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
   ~L2DistanceLayer() {}
 
@@ -43,7 +43,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   // Store the result of subtracting Input2 from Input1 in forward computation,
   // which will be reused in backward computation.
   MatrixPtr inputSub_;
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 8da342a00f..13e20e8316 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -60,7 +60,7 @@ enum PADDLE_DEVICE_ID {
  * Define necessary variables and functions for every layer.
  */
 class Layer {
-protected:
+ protected:
   /// Layer config
   LayerConfig config_;
   /// whether to use GPU
@@ -112,7 +112,7 @@ protected:
   /// Layer backward function
   std::vector<std::shared_ptr<FunctionBase>> backward_;
 
-public:
+ public:
   /**
    * Wait until all input value ready.
    * Called before Layer::forward() function.
@@ -137,7 +137,7 @@ public:
    */
   virtual void markAllInputGrad();
 
-protected:
+ protected:
   /**
    * Create layer function. Function is called in forward or backward.
    * \param function, Layer::forward_ or Layer::backward_
@@ -252,7 +252,7 @@ protected:
    */
   void addOutputArgument(int deviceId);
 
-public:
+ public:
   explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu);
   virtual ~Layer() {}
 
@@ -490,7 +490,7 @@ public:
    */
   virtual void onPassEnd() {}
 
-protected:
+ protected:
   /**
    * Forward of activation function.
    */
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
index 1ea4c7e105..e802b701d0 100644
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 
 class LinearChainCRF {
-public:
+ public:
   /**
    * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
@@ -71,7 +71,7 @@ public:
    */
   MatrixPtr getXGrad() { return matGrad_; }
 
-protected:
+ protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h
index 0b774277dc..5b325a0deb 100644
--- a/paddle/gserver/layers/LinearChainCTC.h
+++ b/paddle/gserver/layers/LinearChainCTC.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 
 class LinearChainCTC {
-public:
+ public:
   LinearChainCTC(int numClasses, bool normByTimes);
 
   // Calculate the negative log probability as loss
@@ -35,7 +35,7 @@ public:
                 int* labelSeq,
                 int labelSeqLen);
 
-protected:
+ protected:
   int numClasses_, blank_, totalSegments_, totalTime_;
   bool normByTimes_;
   bool isInvalid_;
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index b7d55eb1f9..80fb01cd18 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class LstmCompute {
-public:
+ public:
   void init(LayerConfig &config);
 
   /**
@@ -57,7 +57,7 @@ public:
                            hl_lstm_grad grad,
                            int frameSize);
 
-public:
+ public:
   hl_activation_mode_t activeNode_;
   hl_activation_mode_t activeGate_;
   hl_activation_mode_t activeState_;
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index 4568b13ade..76dfe8146b 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -71,7 +71,7 @@ namespace paddle {
  */
 
 class LstmLayer : public Layer, public LstmCompute {
-public:
+ public:
   explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
 
   bool init(const LayerMap &layerMap,
@@ -87,7 +87,7 @@ public:
 
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   /**
    * @brief Compute lstm forward one sequence by one sequence.
    * @param batchSize The batchSize is not equal to the batch_size in
@@ -165,7 +165,7 @@ protected:
    */
   void getPrevBatchState(size_t numSequences);
 
-protected:
+ protected:
   /// Learned parameters, shape: (size, 4*size).
   /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
   std::unique_ptr<Weight> weight_;
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp
index 8faaa1c4e1..c44768ddb2 100644
--- a/paddle/gserver/layers/LstmStepLayer.cpp
+++ b/paddle/gserver/layers/LstmStepLayer.cpp
@@ -22,7 +22,7 @@ namespace paddle {
  * LstmStepLayer used in recurrent layer group.
  */
 class LstmStepLayer : public Layer, public LstmCompute {
-protected:
+ protected:
   Argument state_;
   Argument gate_;
   Argument stateActive_;
@@ -30,7 +30,7 @@ protected:
   MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_;
   std::unique_ptr<Weight> weight_;
 
-public:
+ public:
   explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {}
 
   ~LstmStepLayer() {}
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index 7cfdb3ff25..22c28157c5 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 
 class CoordIterator {
-public:
+ public:
   std::vector<int> dims_;
   std::vector<bool> directions_;
   std::vector<int> curPos_;
@@ -51,7 +51,7 @@ public:
     }
   }
 
-public:
+ public:
   CoordIterator(std::vector<int> dim, std::vector<bool> directions)
       : dims_(dim), directions_(directions), end_(false) {
     CHECK_EQ(dims_.size(), directions_.size());
@@ -178,7 +178,7 @@ public:
  * */
 
 class MDLstmLayer : public LstmLayer {
-public:
+ public:
   explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -188,13 +188,13 @@ public:
 
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   void forwardOneSequence(int start, CoordIterator& coordIter);
   void backwardOneSequence(int start, CoordIterator& coordIter);
   void forwardGate2OutputSequence(int start, CoordIterator& coordIter);
   void backwardGate2OutputSequence(int start, CoordIterator& coordIter);
 
-protected:
+ protected:
   std::vector<Argument> frameInputGate_;
   std::vector<Argument> frameForgetGate_;
   std::vector<Argument> frameOutputGate_;
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index e40e2f2251..0b385e804f 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -25,7 +25,7 @@ namespace paddle {
  * The config file api is mkldnn_addto
  */
 class MKLDNNAddtoLayer : public MKLDNNLayer {
-protected:
+ protected:
   // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
   size_t layerSize_;
 
@@ -38,7 +38,7 @@ protected:
   std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
   std::shared_ptr<mkldnn::primitive> bwdBias_;
 
-public:
+ public:
   explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
 
   ~MKLDNNAddtoLayer() {}
@@ -59,7 +59,7 @@ public:
 
   void updateWeights(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h
index d84e285940..786ceaf860 100644
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ b/paddle/gserver/layers/MKLDNNBase.h
@@ -31,7 +31,7 @@ typedef enum {
  *
  */
 class CPUEngine {
-public:
+ public:
   static CPUEngine& Instance() {
     // Thread-safe in C++11.
     static CPUEngine myInstance;
@@ -46,12 +46,12 @@ public:
 
   mkldnn::engine& getEngine() { return cpuEngine_; }
 
-protected:
+ protected:
   CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
   //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
   ~CPUEngine() {}
 
-private:
+ private:
   mkldnn::engine cpuEngine_;
 };
 
@@ -60,7 +60,7 @@ private:
  *
  */
 class MKLDNNStream {
-public:
+ public:
   MKLDNNStream() : ready_(false) { resetState(); }
 
   virtual ~MKLDNNStream() {}
@@ -89,7 +89,7 @@ public:
     ready_ = true;
   }
 
-private:
+ private:
   bool ready_;
   std::shared_ptr<mkldnn::stream> stream_;
 };
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
index 93e182206a..9aa20df98f 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -27,7 +27,7 @@ typedef mkldnn::batch_normalization_backward bn_bwd;
  * The config file api is mkldnn_batch_norm
  */
 class MKLDNNBatchNormLayer : public MKLDNNLayer {
-protected:
+ protected:
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
 
@@ -62,7 +62,7 @@ protected:
   MKLDNNMatrixPtr mean_;
   MKLDNNMatrixPtr var_;
 
-public:
+ public:
   explicit MKLDNNBatchNormLayer(const LayerConfig& config)
       : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
 
@@ -88,7 +88,7 @@ public:
 
   void convertWeightsFromPaddle() override;
 
-protected:
+ protected:
   void initWeight();
   /**
    * cal moving mean and variance.
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
index f7abdabfb5..d7738df6c1 100644
--- a/paddle/gserver/layers/MKLDNNConcatLayer.h
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -25,7 +25,7 @@ namespace paddle {
  * The config file api is mkldnn_concat
  */
 class MKLDNNConcatLayer : public MKLDNNLayer {
-protected:
+ protected:
   std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
   // input channel numbers
   std::vector<int> channels_;
@@ -35,7 +35,7 @@ protected:
   // if axis_ == 1, concat channel (default)
   int axis_;
 
-public:
+ public:
   explicit MKLDNNConcatLayer(const LayerConfig& config)
       : MKLDNNLayer(config), axis_(1) {}
 
@@ -75,7 +75,7 @@ public:
     return totalSize;
   }
 
-protected:
+ protected:
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index 29c8735fbb..d399035ed3 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -28,7 +28,7 @@ typedef mkldnn::convolution_backward_data conv_bwdData;
  * The config file api is mkldnn_conv
  */
 class MKLDNNConvLayer : public MKLDNNLayer {
-protected:
+ protected:
   // padding height and width
   int ph_, pw_;
   // stride height and width
@@ -59,7 +59,7 @@ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit MKLDNNConvLayer(const LayerConfig& config)
       : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
 
@@ -92,7 +92,7 @@ public:
                        << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
-protected:
+ protected:
   /**
    * load the dims settings of this conv
    */
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index 0d41a4379d..a704066cc8 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -28,7 +28,7 @@ typedef mkldnn::inner_product_backward_data fc_bwdData;
  * The config file api is mkldnn_fc
  */
 class MKLDNNFcLayer : public MKLDNNLayer {
-protected:
+ protected:
   // input layer size, can not be change after init
   size_t iLayerSize_;  // == ic * ih * iw
 
@@ -42,7 +42,7 @@ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit MKLDNNFcLayer(const LayerConfig& config)
       : MKLDNNLayer(config), hasInitedWgt_(false) {}
 
@@ -68,7 +68,7 @@ public:
 
   void convertWeightsToPaddle() override;
 
-protected:
+ protected:
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h
index b503ee5594..028438f2c9 100644
--- a/paddle/gserver/layers/MKLDNNLRNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
@@ -27,7 +27,7 @@ typedef mkldnn::lrn_backward lrn_bwd;
  * The config file api is mkldnn_lrn
  */
 class MKLDNNLRNLayer : public MKLDNNLayer {
-protected:
+ protected:
   // save forward primitive_desc, which can be used in backward
   std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
   // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
@@ -37,7 +37,7 @@ protected:
   int localSize_;
   float alpha_, beta_;  // scale and pow in paddle
 
-public:
+ public:
   explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
 
   ~MKLDNNLRNLayer() {}
@@ -56,7 +56,7 @@ public:
                 std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
-protected:
+ protected:
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 4a7eb74ce3..2b164d0d3b 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -33,7 +33,7 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
  *
  */
 class MKLDNNLayer : public Layer {
-protected:
+ protected:
   // batch size
   int bs_;
   // their sizes are always from the first input layer
@@ -95,7 +95,7 @@ protected:
   // tmp input argument to save input grad, only used to merge grad
   Argument tmpInArg_;
 
-public:
+ public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
         ih_(0),
@@ -162,7 +162,7 @@ public:
    */
   void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
 
-protected:
+ protected:
   /**
    * Some layers may have different condition to reset the forward.
    * The function returns the condition that do not need reset forward.
@@ -233,7 +233,7 @@ protected:
    */
   void resetMergeGrad(MKLDNNMatrixPtr& out);
 
-protected:
+ protected:
   /**
    * Set deviceId of this layer.
    */
@@ -340,7 +340,7 @@ protected:
     }
   }
 
-private:
+ private:
   /**
    * clear all grad
    */
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index 12821cda73..1eb0ee4ad9 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -27,7 +27,7 @@ typedef mkldnn::pooling_backward pool_bwd;
  * The config file api is mkldnn_pool
  */
 class MKLDNNPoolLayer : public MKLDNNLayer {
-protected:
+ protected:
   // padding height and width
   int ph_, pw_;
   // stride height and width
@@ -44,7 +44,7 @@ protected:
   // test_pooling_forward.cpp, pool need workspace for backward
   std::shared_ptr<mkldnn::memory> workspace_;
 
-public:
+ public:
   explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
 
   ~MKLDNNPoolLayer() {}
@@ -70,7 +70,7 @@ public:
                        << ", sw: " << sw_;
   }
 
-protected:
+ protected:
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
index 37eb362d45..441025a9c9 100644
--- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class MKLPackedRecurrentLayer : public RecurrentLayer {
-public:
+ public:
   explicit MKLPackedRecurrentLayer(const LayerConfig& config)
       : RecurrentLayer(config) {}
 
@@ -38,7 +38,7 @@ public:
 
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   void forwardBatch(int batchSize,
                     size_t numSequences,
                     const int* starts) override;
@@ -47,7 +47,7 @@ protected:
                      size_t numSequences,
                      const int* starts) override;
 
-protected:
+ protected:
   /// packed_weight_ contains same data with
   /// RecurrentLayer::weight_ but is packed
   std::unique_ptr<MKLPackedWeight> packed_weight_;
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
index 28b8a7db7c..b01a961d00 100644
--- a/paddle/gserver/layers/MKLPackedWeight.h
+++ b/paddle/gserver/layers/MKLPackedWeight.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class MKLPackedWeight {
-protected:
+ protected:
   /// The pointer of weight
   real *weight_;
   /// The pointer of cblas packed gemm to weight
@@ -30,7 +30,7 @@ protected:
   size_t width_;
   bool transW_;
 
-public:
+ public:
   explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
     packedWeight_ = nullptr;
     weight_ = weight->getData();
@@ -59,7 +59,7 @@ public:
                         dst->getWidth());
   }
 
-protected:
+ protected:
   void pack_(real *src) {
     if (!packedWeight_) {
       packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp
index 84e375d744..eecd4996e9 100644
--- a/paddle/gserver/layers/MaxIdLayer.cpp
+++ b/paddle/gserver/layers/MaxIdLayer.cpp
@@ -23,11 +23,11 @@ namespace paddle {
  * The config file api is maxid_layer.
  */
 class MaxIdLayer : public Layer {
-private:
+ private:
   /// a predetermined number of best states at each level
   size_t beamSize_;
 
-public:
+ public:
   explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index 9dbc672652..e46f997c34 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -39,11 +39,11 @@ namespace paddle {
  */
 
 class MaxLayer : public SequencePoolLayer {
-protected:
+ protected:
   // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
   IVectorPtr maxIndex_;
 
-public:
+ public:
   explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
index 1fb371836b..0eb8674b4c 100644
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class MaxOutLayer : public Layer {
-protected:
+ protected:
   size_t groups_;
   size_t imgSizeH_, imgSizeW_;
   /// outputChannels_ = channels_ / groups_
@@ -38,7 +38,7 @@ protected:
   size_t featLen_;
   IVectorPtr maxoutId_;
 
-public:
+ public:
   /// return imgSizeH_ * imgSizeW_ * outputChannels_;
   size_t getSize();
 
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
index 74cc8acf35..c948364f6b 100644
--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
@@ -23,10 +23,10 @@ namespace paddle {
  * @brief Basic parent layer of different kinds of pooling
  */
 class MaxPoolWithMaskLayer : public PoolLayer {
-protected:
+ protected:
   Argument mask_;
 
-public:
+ public:
   explicit MaxPoolWithMaskLayer(const LayerConfig& config)
       : PoolLayer(config) {}
 
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
index a1a43c52e4..43ee2bd818 100644
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
@@ -30,7 +30,7 @@ namespace paddle {
  * The config file api is mixed_layer.
  */
 class MixedLayer : public Layer {
-public:
+ public:
   explicit MixedLayer(const LayerConfig& config) : Layer(config) {}
 
   ~MixedLayer() {}
@@ -52,7 +52,7 @@ public:
    */
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Projection>> projections_;
   std::vector<std::unique_ptr<Operator>> operators_;
   /// the matrix size of projection state
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
index 9935da5644..a358cded00 100644
--- a/paddle/gserver/layers/MultiBoxLossLayer.h
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
@@ -41,7 +41,7 @@ namespace paddle {
  */
 
 class MultiBoxLossLayer : public CostLayer {
-public:
+ public:
   explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
 
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -54,7 +54,7 @@ public:
 
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
 
-protected:
+ protected:
   inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
   inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
   inline LayerPtr getLocInputLayer(size_t index) {
@@ -64,7 +64,7 @@ protected:
     return inputLayers_[2 + inputNum_ + index];
   }
 
-protected:
+ protected:
   size_t numClasses_;
   real overlapThreshold_;
   real negPosRatio_;
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index 1f9e818ee5..8cbb229f15 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -29,7 +29,7 @@ namespace paddle {
  * The computational complexity of generate one sample is O(1).
  */
 class MultinomialSampler {
-public:
+ public:
   MultinomialSampler(const real* prob, int size);
 
   //! protobuf always using double.
@@ -53,7 +53,7 @@ public:
     return gen1([&g, this]() { return rand_(g); });
   }
 
-protected:
+ protected:
   /**
    * @brief Generation
    * @param[in] rand rand is a real random number distribution
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
index 82857f8c3e..43ecc48cd9 100644
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class MultiplexLayer : public Layer {
-protected:
+ protected:
   /**
    * @brief A struct is used to save the copy information, includes input
    * layer index and copy size.
@@ -64,7 +64,7 @@ protected:
   /// Temporary matrix pointer to point to output data.
   MatrixPtr tmpDest_;
 
-public:
+ public:
   explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {}
 
   ~MultiplexLayer() {}
@@ -75,7 +75,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /**
    * @brief Calculate copy info for input layers.
    */
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index d3d7b1fd9a..cc48fe100f 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -54,7 +54,7 @@ class NCELayer : public Layer {
 
   IVectorPtr labelIds_;
 
-public:
+ public:
   explicit NCELayer(const LayerConfig& config)
       : Layer(config),
         numClasses_(config.num_classes()),
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index c89cbbfce9..3807584415 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -27,7 +27,7 @@ namespace paddle {
  * @note Normalize the input in local region
  */
 class NormLayer : public Layer {
-public:
+ public:
   explicit NormLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -49,12 +49,12 @@ public:
  * Need to implement in the futrue.
  */
 class ResponseNormLayer : public NormLayer {
-protected:
+ protected:
   size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
   real scale_, pow_;
   MatrixPtr denoms_;
 
-public:
+ public:
   explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -76,7 +76,7 @@ public:
  *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
  */
 class CrossChannelNormLayer : public NormLayer {
-public:
+ public:
   explicit CrossChannelNormLayer(const LayerConfig& config)
       : NormLayer(config) {}
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -85,7 +85,7 @@ public:
   MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
   MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
 
-protected:
+ protected:
   size_t channels_;
   std::unique_ptr<Weight> scale_;
   MatrixPtr scaleDiff_;
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 898b5823a9..64803a1603 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -28,7 +28,7 @@ class CMRProjectionNormLayer : public ResponseNormLayer {
   size_t imgSizeH_, imgSizeW_;
   size_t outputH_, outputW_;
 
-public:
+ public:
   explicit CMRProjectionNormLayer(const LayerConfig& config)
       : ResponseNormLayer(config) {}
 
@@ -41,7 +41,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   TensorShape shape_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h
index a620926ccc..42d525ef3e 100644
--- a/paddle/gserver/layers/Operator.h
+++ b/paddle/gserver/layers/Operator.h
@@ -34,7 +34,7 @@ namespace paddle {
  * @note: Operator can't have parameters.
  */
 class Operator {
-public:
+ public:
   static Operator* create(const OperatorConfig& config, bool useGpu);
 
   Operator(const OperatorConfig& config, bool useGpu)
@@ -81,7 +81,7 @@ public:
    */
   virtual LayerStatePtr getState() { return nullptr; }
 
-protected:
+ protected:
   /// Config of operator
   OperatorConfig config_;
   bool useGpu_;
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index 75f4abf93e..11a910f331 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -28,12 +28,12 @@ namespace paddle {
  */
 
 class OuterProdLayer : public Layer {
-protected:
+ protected:
   MatrixPtr tmpMtx0;
   MatrixPtr tmpRow0;
   MatrixPtr tmpRow1;
 
-public:
+ public:
   explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {}
 
   ~OuterProdLayer() {}
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
index 7e09d7f8a0..46b8a59597 100644
--- a/paddle/gserver/layers/PadLayer.h
+++ b/paddle/gserver/layers/PadLayer.h
@@ -24,7 +24,7 @@ namespace paddle {
  *         the 4th dimenstion according padc_, padh_ and padw_.
  */
 class PadLayer : public Layer {
-public:
+ public:
   explicit PadLayer(const LayerConfig& config) : Layer(config) {}
 
   ~PadLayer() {}
@@ -34,7 +34,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   void setOutDims(const size_t batchSize);
   void setTensorDim(const size_t batchSize);
 
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
index 3725fa4a11..4553413fcd 100644
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ b/paddle/gserver/layers/ParameterReluLayer.h
@@ -36,7 +36,7 @@ namespace paddle {
  */
 
 class ParameterReluLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 
   /**
@@ -51,7 +51,7 @@ protected:
    */
   size_t partialSum_;
 
-public:
+ public:
   explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ParameterReluLayer() {}
diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h
index 59ee73f7cb..32605f8b70 100644
--- a/paddle/gserver/layers/Pool3DLayer.h
+++ b/paddle/gserver/layers/Pool3DLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * Pools the input within regions
  */
 class Pool3DLayer : public Layer {
-public:
+ public:
   explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
   ~Pool3DLayer() {}
 
@@ -36,7 +36,7 @@ public:
   void backward(const UpdateCallback& callback) override;
   size_t getSize();
 
-protected:
+ protected:
   int channels_;
   int sizeX_, sizeY_, sizeZ_;
   int strideW_, strideH_, strideD_;
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index 58d5fb0a09..99f8f148e2 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * Pools the input within regions
  */
 class PoolLayer : public Layer {
-protected:
+ protected:
   size_t channels_, sizeX_, stride_, outputX_, imgSize_;
   int confPadding_;
 
@@ -40,7 +40,7 @@ protected:
 
   bool excludeMode_;
 
-public:
+ public:
   explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
 
   /**
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index c99287dbf0..8004cc1550 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 
 class PoolProjection : public Projection {
-protected:
+ protected:
   size_t imgSizeY_, imgSize_;
   size_t outputY_, outputX_;
   size_t strideY_, stride_;
@@ -30,7 +30,7 @@ protected:
   std::string poolType_;
   bool excludeMode_;
 
-public:
+ public:
   PoolProjection(const ProjectionConfig& config,
                  ParameterPtr parameter,
                  bool useGpu);
@@ -45,7 +45,7 @@ public:
 };
 
 class MaxPoolProjection : public PoolProjection {
-public:
+ public:
   MaxPoolProjection(const ProjectionConfig& config,
                     ParameterPtr parameter,
                     bool useGpu)
@@ -56,7 +56,7 @@ public:
 };
 
 class AvgPoolProjection : public PoolProjection {
-public:
+ public:
   AvgPoolProjection(const ProjectionConfig& config,
                     ParameterPtr parameter,
                     bool useGpu)
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
index 5a97a7769a..9ad144cc2a 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ b/paddle/gserver/layers/PoolProjectionLayer.h
@@ -24,13 +24,13 @@ namespace paddle {
  * @brief Basic parent layer of different kinds of pooling
  */
 class PoolProjectionLayer : public PoolLayer {
-protected:
+ protected:
   size_t imgSizeH_, imgSizeW_;
   size_t outputH_, outputW_;
   std::unique_ptr<PoolProjection> poolProjection_;
   ProjectionConfig projectionConfig_;
 
-public:
+ public:
   explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {
     PoolConfig* conf = projectionConfig_.mutable_pool_conf();
     *conf = config_.inputs(0).pool_conf();
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
index 18f650fcda..7e8d60db8f 100644
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -32,10 +32,10 @@ namespace paddle {
  */
 
 class PowerLayer : public Layer {
-protected:
+ protected:
   MatrixPtr tmpMtx;
 
-public:
+ public:
   explicit PowerLayer(const LayerConfig& config) : Layer(config) {}
 
   ~PowerLayer() {}
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index 5a527d598d..6fbcc447f9 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 
 class PrintLayer : public Layer {
-public:
+ public:
   explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
 
   void forward(PassType passType) override {
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index 56a4d942f0..39d2c2d737 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -28,7 +28,7 @@ namespace paddle {
  */
 
 class PriorBoxLayer : public Layer {
-public:  // NOLINT
+ public:  // NOLINT
   explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
@@ -36,7 +36,7 @@ public:  // NOLINT
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override {}
 
-protected:  // NOLINT
+ protected:  // NOLINT
   int numPriors_;
   std::vector<int> minSize_;
   std::vector<int> maxSize_;
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
index 1f0b96c79e..88a41355cf 100644
--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -37,7 +37,7 @@ namespace paddle {
  * to output Argument.
  */
 class Projection {
-public:
+ public:
   static Projection* create(const ProjectionConfig& config,
                             ParameterPtr parameter,
                             bool useGpu);
@@ -98,7 +98,7 @@ public:
    */
   size_t getOutputSize() const { return config_.output_size(); }
 
-protected:
+ protected:
   /**
    * Create layer function. Function is called in forward or backward.
    * \param function, Layer::forward_ or Layer::backward_
@@ -119,7 +119,7 @@ protected:
     func->init(config);
   }
 
-protected:
+ protected:
   /// Config of projection
   ProjectionConfig config_;
   /// Parameter of projection
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
index b1735e9748..801a9b3aeb 100644
--- a/paddle/gserver/layers/ROIPoolLayer.h
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  */
 
 class ROIPoolLayer : public Layer {
-protected:
+ protected:
   size_t channels_;
   size_t width_;
   size_t height_;
@@ -44,7 +44,7 @@ protected:
   // Since there is no int matrix, use real maxtrix instead.
   MatrixPtr maxIdxs_;
 
-public:
+ public:
   explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h
index 8fd4fe6b78..94e633e657 100644
--- a/paddle/gserver/layers/RecurrentLayer.h
+++ b/paddle/gserver/layers/RecurrentLayer.h
@@ -40,7 +40,7 @@ namespace paddle {
  */
 
 class RecurrentLayer : public Layer {
-public:
+ public:
   explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -56,7 +56,7 @@ public:
 
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   /**
    * @brief If user do not set --rnn_use_batch=true, it will
    * compute rnn forward one sequence by one sequence in default.
@@ -110,7 +110,7 @@ protected:
                              size_t numSequences,
                              const int* starts);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> bias_;
 
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
index 44b57185c5..6694e8f299 100644
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
@@ -27,7 +27,7 @@ namespace paddle {
  * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
  */
 class RecurrentLayerGroup : public Layer {
-public:
+ public:
   explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
 
   void initSubNetwork(NeuralNetwork* rootNetwork,
@@ -58,7 +58,7 @@ public:
     callback(*network_);
   }
 
-private:
+ private:
   std::unique_ptr<RecurrentGradientMachine> network_;
 };
 
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
index 831f4c3b7e..d4ae994593 100644
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  * resize matrix: (height * width / size) * size
  */
 class ResizeLayer : public Layer {
-public:
+ public:
   explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
index 3b619921ab..7ecbff2016 100644
--- a/paddle/gserver/layers/RotateLayer.h
+++ b/paddle/gserver/layers/RotateLayer.h
@@ -32,7 +32,7 @@ namespace paddle {
  */
 
 class RotateLayer : public Layer {
-public:
+ public:
   explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -40,7 +40,7 @@ public:
   void forward(PassType passType);
   void backward(const UpdateCallback& callback = nullptr);
 
-private:
+ private:
   int batchSize_;
   int size_;
   int height_;
diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/gserver/layers/RowConvLayer.h
index ba0af1de68..3b74df0b1a 100644
--- a/paddle/gserver/layers/RowConvLayer.h
+++ b/paddle/gserver/layers/RowConvLayer.h
@@ -22,7 +22,7 @@ namespace paddle {
  * \brief Row Convolution Layer.
  */
 class RowConvLayer : public Layer {
-public:
+ public:
   explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
 
   ~RowConvLayer() {}
@@ -32,7 +32,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   // Row convolution weight, context_lenght_ * fan_out.
   // fan_out is the size of output feature.
   std::unique_ptr<Weight> weight_;
diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp
index 7ff0c9bae9..d5e6e10a02 100644
--- a/paddle/gserver/layers/RowL2NormLayer.cpp
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@@ -26,12 +26,12 @@ namespace paddle {
  */
 
 class RowL2NormLayer : public Layer {
-protected:
+ protected:
   MatrixPtr inSquare_;
   MatrixPtr l2NormReciprocal_;
   MatrixPtr dotSum_;
 
-public:
+ public:
   explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index 2edd915d22..dbce635881 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -31,7 +31,7 @@ class SamplingIdLayer : public Layer {
   std::uniform_real_distribution<double> rand1_;
   std::vector<Argument> tmpCpuInput_;
 
-public:
+ public:
   explicit SamplingIdLayer(const LayerConfig& config)
       : Layer(config), rand1_(0, 1) {}
 
diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/gserver/layers/ScaleShiftLayer.cpp
index 799d1fe51a..8af78a2e27 100644
--- a/paddle/gserver/layers/ScaleShiftLayer.cpp
+++ b/paddle/gserver/layers/ScaleShiftLayer.cpp
@@ -30,11 +30,11 @@ namespace paddle {
  */
 
 class ScaleShiftLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> scale_;
   std::unique_ptr<Weight> offset_;
 
-public:
+ public:
   explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
index 6e861be485..fe431698bc 100644
--- a/paddle/gserver/layers/ScaleSubRegionLayer.h
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  *                  region.
  */
 class ScaleSubRegionLayer : public Layer {
-public:
+ public:
   explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ScaleSubRegionLayer() {}
@@ -40,7 +40,7 @@ public:
 
   void backward(const UpdateCallback& callback = nullptr);
 
-protected:
+ protected:
   TensorShape shape_;
   TensorShape indicesShape_;
   size_t imgH_;
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
index 1d98a7373d..15e07daebe 100644
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
@@ -32,7 +32,7 @@ namespace paddle {
  */
 
 class ScalingLayer : public Layer {
-public:
+ public:
   explicit ScalingLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ScalingLayer() {}
diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp
index 99b5b68f54..4d871cafc4 100644
--- a/paddle/gserver/layers/ScalingProjection.cpp
+++ b/paddle/gserver/layers/ScalingProjection.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 
 class ScalingProjection : public Projection {
-public:
+ public:
   ScalingProjection(const ProjectionConfig& config,
                     const ParameterPtr& parameter,
                     bool useGpu)
@@ -48,7 +48,7 @@ public:
     }
   }
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 };
 
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
index 8156407418..4b32ce8b16 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -33,11 +33,11 @@ namespace paddle {
  * The config file api is selective_fc_layer.
  */
 class SelectiveFullyConnectedLayer : public Layer {
-protected:
+ protected:
   WeightList weights_;
   std::unique_ptr<Weight> biases_;
 
-private:
+ private:
   /**
    * Get selected columns each forward.
    */
@@ -60,7 +60,7 @@ private:
   /// if true, means output_.value is the same as Fc Layer
   bool fullOutput_;
 
-public:
+ public:
   explicit SelectiveFullyConnectedLayer(const LayerConfig& config)
       : Layer(config), selCols_(nullptr) {}
 
@@ -94,7 +94,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /**
    * @brief Make SelectiveFC act as FullyConnectedLayer
    */
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
index cf573f3f33..c84c3ce4f0 100644
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
@@ -29,10 +29,10 @@ namespace paddle {
  */
 
 class SequenceConcatLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {}
 
   ~SequenceConcatLayer() {}
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 6c4ae775c1..28d0a9296d 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -38,12 +38,12 @@ namespace paddle {
  */
 
 class SequenceLastInstanceLayer : public SequencePoolLayer {
-protected:
+ protected:
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
   std::vector<int> instanceIds_;
 
-public:
+ public:
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index 254e4cc6b3..01183060af 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -41,7 +41,7 @@ namespace paddle {
  */
 
 class SequencePoolLayer : public Layer {
-protected:
+ protected:
   int type_;
   std::unique_ptr<Weight> biases_;
   enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
@@ -51,7 +51,7 @@ protected:
   // Whether the input sequence is reversed or not.
   bool reversed_ = false;
 
-public:
+ public:
   explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index fb96669917..319310af8c 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -29,12 +29,12 @@ namespace paddle {
  */
 
 class SequenceReshapeLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
 
   MatrixPtr reshapedOutputGrad;
 
-public:
+ public:
   explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp b/paddle/gserver/layers/SequenceSliceLayer.cpp
index 1b7c33477e..a6d810b583 100644
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class SequenceSliceLayer : public Layer {
-public:
+ public:
   explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -30,7 +30,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /*
    * TODO(caoying)
    * In PaddePaddle, currently all matrices are real number types,
diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h
index 8743a5ef10..5200e702d9 100644
--- a/paddle/gserver/layers/SequenceToBatch.h
+++ b/paddle/gserver/layers/SequenceToBatch.h
@@ -39,7 +39,7 @@ namespace paddle {
  *
  */
 class SequenceToBatch {
-public:
+ public:
   explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
 
   /* resize and calculate the batchIndex_ */
@@ -82,7 +82,7 @@ public:
     numBatch_ = seq2batch.numBatch_;
   }
 
-protected:
+ protected:
   void sequence2BatchCopy(Matrix &batch,
                           Matrix &sequence,
                           IVector &seq2BatchIdx,
diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp
index 5627ad1eb3..b474f2db75 100644
--- a/paddle/gserver/layers/SliceProjection.cpp
+++ b/paddle/gserver/layers/SliceProjection.cpp
@@ -44,14 +44,14 @@ namespace paddle {
  * The config file api is slice_projection.
  */
 class SliceProjection : public Projection {
-public:
+ public:
   SliceProjection(const ProjectionConfig& config,
                   const ParameterPtr& parameter,
                   bool useGpu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::vector<std::pair<size_t, size_t>> slices_;
 };
 
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
index c94a07e5da..f7f4735c1b 100644
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
@@ -36,7 +36,7 @@ namespace paddle {
  */
 
 class SlopeInterceptLayer : public Layer {
-public:
+ public:
   explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
index 6cb5fdf83e..421bdfe09c 100644
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class SpatialPyramidPoolLayer : public Layer {
-protected:
+ protected:
   size_t channels_;
   size_t imgSizeW_;
   size_t imgSizeH_;
@@ -40,7 +40,7 @@ protected:
   std::vector<Argument> projOutput_;
   std::vector<std::pair<size_t, size_t>> projCol_;
 
-public:
+ public:
   explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
index db240ab0c9..e2bb00bbfa 100644
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class SubNestedSequenceLayer : public Layer {
-public:
+ public:
   explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -30,7 +30,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /*
    * This functions generates the indices of rows in a batch according to the
    * indices of selected sub-sequence in each sequence.
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 808627f092..ba49f5710f 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -27,12 +27,12 @@ namespace paddle {
  */
 
 class SubSequenceLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
 
-public:
+ public:
   explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
index ffbe149253..00764717e8 100644
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
@@ -32,13 +32,13 @@ namespace paddle {
  */
 
 class SumToOneNormLayer : public Layer {
-protected:
+ protected:
   /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$
   MatrixPtr reciprocalRowSum_;
   /// dotSum = output_.grad \f$.*\f$ output_.value
   MatrixPtr dotSum_;
 
-public:
+ public:
   explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SwitchOrderLayer.h b/paddle/gserver/layers/SwitchOrderLayer.h
index 882437f443..8a551a2bba 100644
--- a/paddle/gserver/layers/SwitchOrderLayer.h
+++ b/paddle/gserver/layers/SwitchOrderLayer.h
@@ -22,7 +22,7 @@ namespace paddle {
  * \brief  This layer calculate softmax in image channel dimension.
  */
 class SwitchOrderLayer : public Layer {
-public:
+ public:
   explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
 
   ~SwitchOrderLayer() {}
@@ -34,7 +34,7 @@ public:
   void setInDims();
   void setOutDims();
 
-protected:
+ protected:
   std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
   std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
   TensorShape inDims_;
diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/gserver/layers/TableProjection.h
index ffb05e68f0..60286149f4 100644
--- a/paddle/gserver/layers/TableProjection.h
+++ b/paddle/gserver/layers/TableProjection.h
@@ -32,7 +32,7 @@ namespace paddle {
  * @note If \f$ids[i] = -1\f$, it will be ignored.
  */
 class TableProjection : public Projection {
-public:
+ public:
   TableProjection(const ProjectionConfig& config,
                   const ParameterPtr& parameter,
                   bool useGpu);
@@ -43,7 +43,7 @@ public:
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::unique_ptr<Weight> table_;
 };
 
diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h
index 8a323aa15f..5c1ee40ced 100644
--- a/paddle/gserver/layers/TensorLayer.h
+++ b/paddle/gserver/layers/TensorLayer.h
@@ -37,11 +37,11 @@ namespace paddle {
  */
 
 class TensorLayer : public Layer {
-protected:
+ protected:
   WeightList weights_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
index 03d0948624..1cd8fd91f7 100644
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  * The config file api is trans_layer.
  */
 class TransLayer : public Layer {
-public:
+ public:
   explicit TransLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
index 755389f707..45f5977989 100644
--- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
+++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
@@ -24,14 +24,14 @@ namespace paddle {
  * The config file api is trans_full_matrix_projection.
  */
 class TransposedFullMatrixProjection : public Projection {
-public:
+ public:
   TransposedFullMatrixProjection(const ProjectionConfig& config,
                                  ParameterPtr parameter,
                                  bool useGPu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 };
 
diff --git a/paddle/gserver/layers/UpsampleLayer.h b/paddle/gserver/layers/UpsampleLayer.h
index 25efbac5e9..c9d079c314 100644
--- a/paddle/gserver/layers/UpsampleLayer.h
+++ b/paddle/gserver/layers/UpsampleLayer.h
@@ -30,7 +30,7 @@ namespace paddle {
  */
 
 class UpsampleLayer : public Layer {
-public:
+ public:
   explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {}
   ~UpsampleLayer() {}
 
@@ -42,7 +42,7 @@ public:
 
   size_t getOutputSize();
 
-protected:
+ protected:
   size_t scale_, scaleY_;
   size_t upsampleSize_, upsampleSizeY_;
   size_t padOutX_, padOutY_;
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index f412d685c0..be41128ef4 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -23,7 +23,7 @@ DECLARE_int32(trainer_id);
 namespace paddle {
 
 class ValidationLayer : public Layer {
-public:
+ public:
   explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -51,7 +51,7 @@ public:
  * AucValidation
  */
 class AucValidation : public ValidationLayer {
-public:
+ public:
   explicit AucValidation(const LayerConfig& config)
       : ValidationLayer(config),
         cpuOutput_(nullptr),
@@ -72,7 +72,7 @@ public:
   };
   std::vector<PredictionResult> predictArray_;
 
-private:
+ private:
   bool passBegin_;
   std::unique_ptr<Evaluator> evaluator_;
   MatrixPtr cpuOutput_;
@@ -84,7 +84,7 @@ private:
  * positive-negative pair rate Validation
  */
 class PnpairValidation : public ValidationLayer {
-public:
+ public:
   explicit PnpairValidation(const LayerConfig& config)
       : ValidationLayer(config) {}
 
@@ -95,7 +95,7 @@ public:
 
   void onPassEnd() override;
 
-private:
+ private:
   bool passBegin_;
   std::unique_ptr<Evaluator> evaluator_;
 };
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h
index 6f6be359c0..3017ca794e 100644
--- a/paddle/gserver/layers/WarpCTCLayer.h
+++ b/paddle/gserver/layers/WarpCTCLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * The config file api is warp_ctc_layer.
  */
 class WarpCTCLayer : public Layer {
-public:
+ public:
   explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
   ~WarpCTCLayer() {}
 
@@ -35,7 +35,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   /**
    * sequence matrix and batch matrix copy:
    * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
@@ -49,7 +49,7 @@ protected:
                         const ICpuGpuVectorPtr& seqStartPositions,
                         bool normByTimes);
 
-protected:
+ protected:
   size_t numClasses_;
   size_t blank_;
   size_t maxSequenceLength_;
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index c1faa6fd90..41ac46b70a 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -44,7 +44,7 @@ class MKLDNNTester {
     std::vector<VectorPtr> paraValues;
   };
 
-protected:
+ protected:
   std::vector<TestConfig> configs_;
   vector<string> layerNames_;
   vector<vector<DataLayerPtr>> dataLayers_;
@@ -65,7 +65,7 @@ protected:
   /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
   PassType passType_;
 
-public:
+ public:
   explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
     iter_ = iter;
     eps_ = epsilon;
@@ -75,7 +75,7 @@ public:
 
   ~MKLDNNTester() {}
 
-public:
+ public:
   void run(const TestConfig& dnn,
            const TestConfig& ref,
            size_t batchSize,
@@ -97,7 +97,7 @@ public:
                            bool use_mkldnn,
                            size_t iter = 2);
 
-private:
+ private:
   void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
   void setInputImgSize();
   void runOnce();
diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp
index 4a295ea9d5..043025239e 100644
--- a/paddle/gserver/tests/test_MultinomialSampler.cpp
+++ b/paddle/gserver/tests/test_MultinomialSampler.cpp
@@ -27,7 +27,7 @@ using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
 class MultinomialSamplerTester : public MultinomialSampler {
-public:
+ public:
   MultinomialSamplerTester(real* prob, int size)
       : MultinomialSampler(prob, size) {}
 
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 72324fcf29..9770567b88 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -26,7 +26,7 @@ DECLARE_int32(seed);
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 class TrainerForTest : public paddle::Trainer {
-public:
+ public:
   void startTrain() {
     GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
     gm.start();
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index e5ce922f15..b54e37b7db 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -225,7 +225,7 @@ TEST(Layer, RecurrentLayer) {
 #include "paddle/gserver/layers/RecurrentLayer.h"
 template <class T>
 class TestRecurrentLayer {
-public:
+ public:
   LayerConfig config_;
   bool useGpu_;
   bool useBatch_;
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index ae60f6fe5f..c43a83891e 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -27,7 +27,7 @@ namespace paddle {
  * This is the base class of all Allocator class.
  */
 class Allocator {
-public:
+ public:
   virtual ~Allocator() {}
   virtual void* alloc(size_t size) = 0;
   virtual void free(void* ptr) = 0;
@@ -38,7 +38,7 @@ public:
  * @brief CPU allocator implementation.
  */
 class CpuAllocator : public Allocator {
-public:
+ public:
   ~CpuAllocator() {}
 
   /**
@@ -76,7 +76,7 @@ public:
  * @brief GPU allocator implementation.
  */
 class GpuAllocator : public Allocator {
-public:
+ public:
   ~GpuAllocator() {}
 
   /**
@@ -107,7 +107,7 @@ public:
  * @brief CPU pinned memory allocator implementation.
  */
 class CudaHostAllocator : public Allocator {
-public:
+ public:
   ~CudaHostAllocator() {}
 
   /**
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 00ce5a1949..1958629aa0 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -43,7 +43,7 @@ typedef bool_constant<bool, true> true_type;
   address += row * ld + col;
 
 class MatrixOffset {
-public:
+ public:
   size_t aCol_;
   size_t aRow_;
   size_t bCol_;
@@ -72,14 +72,14 @@ public:
 
 template <class T>
 class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
-public:
+ public:
   size_t height_, width_;
   size_t stride_;
   T* data_;
   bool trans_;
   bool useGpu_;
 
-public:
+ public:
   virtual ~BaseMatrixT() {}
   BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu)
       : height_(height),
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 922fb51722..3c897b5f3e 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -51,10 +51,6 @@ else()
 endif()
 
 
-
-add_style_check_target(paddle_math ${MATH_SOURCES})
-add_style_check_target(paddle_math ${MATH_HEADERS})
-
 add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 22b6b71688..172792c295 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 
 class CpuSparseMatrix : public Matrix {
-public:
+ public:
   CpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
@@ -291,10 +291,10 @@ public:
     LOG(FATAL) << "not supported!";
   }
 
-private:
+ private:
   MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false);
 
-protected:
+ protected:
   void sparseResize();
   /*for csr , record row start position, for csc, record row index for every no
    * zero value*/
@@ -310,10 +310,10 @@ protected:
   static ThreadLocal<std::vector<CpuSparseMatrixPtr>> cpuLocalMats_;
 
   // BaseMatrixT interface
-public:
+ public:
   bool isSparse() const { return true; }
 
-private:
+ private:
   using Matrix::mul;
   using Matrix::copyFrom;
   using Matrix::rowMax;
@@ -329,7 +329,7 @@ private:
 namespace paddle {
 
 class CpuSparseMatrix : public Matrix {
-public:
+ public:
   CpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
diff --git a/paddle/math/ExecViaCpu.h b/paddle/math/ExecViaCpu.h
index 9b2a3c2b8a..ec2337545e 100644
--- a/paddle/math/ExecViaCpu.h
+++ b/paddle/math/ExecViaCpu.h
@@ -31,17 +31,17 @@ namespace paddle {
 
 template <typename Arg>
 class CopyToCpu {
-public:
+ public:
   explicit CopyToCpu(Arg& arg) : arg_(arg) {}
   Arg& copiedArg() const { return arg_; }
 
-private:
+ private:
   Arg& arg_;
 };
 
 template <>
 class CopyToCpu<Matrix> {
-public:
+ public:
   explicit CopyToCpu(Matrix& arg) : arg_(arg) {
     if (arg.useGpu()) {
       CHECK(!arg.isTransposed()) << "Not supported";
@@ -59,14 +59,14 @@ public:
   }
   Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   Matrix& arg_;
   MatrixPtr copied_;
 };
 
 template <>
 class CopyToCpu<const Matrix> {
-public:
+ public:
   explicit CopyToCpu(const Matrix& arg) : arg_(arg) {
     if (arg.useGpu()) {
       CHECK(!arg.isTransposed()) << "Not supported";
@@ -79,14 +79,14 @@ public:
   }
   const Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   const Matrix& arg_;
   MatrixPtr copied_;
 };
 
 template <>
 class CopyToCpu<IVector> {
-public:
+ public:
   explicit CopyToCpu(IVector& arg) : arg_(arg) {
     if (arg.useGpu()) {
       copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
@@ -100,14 +100,14 @@ public:
   }
   IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   IVector& arg_;
   IVectorPtr copied_;
 };
 
 template <>
 class CopyToCpu<const IVector> {
-public:
+ public:
   explicit CopyToCpu(const IVector& arg) : arg_(arg) {
     if (arg.useGpu()) {
       copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
@@ -116,7 +116,7 @@ public:
   }
   const IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   const IVector& arg_;
   IVectorPtr copied_;
 };
@@ -128,7 +128,7 @@ class GpuFuncWrapperImp;
 
 template <typename F, typename R, typename... Args>
 class GpuFuncWrapperBase {
-public:
+ public:
   typedef R ResultType;
   R operator()(F&& f, Args... args) {
     return f(CopyToCpu<typename std::remove_reference<Args>::type>(args)
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index e1fb81679a..d4a78f3e54 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -35,7 +35,7 @@ typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
  *
  */
 class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
-public:
+ public:
   MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
       : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
         mkldnn::memory(pd, m->getData()),
@@ -107,7 +107,7 @@ public:
     dst.copyFrom(*m_);
   }
 
-public:
+ public:
   /**
    * Reorder this MKLDNNMatrix from other format.
    * Support inplace reorder.
@@ -226,7 +226,7 @@ public:
    */
   mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
 
-protected:
+ protected:
   /**
    * Do reorder once.
    * Can support inplace.
@@ -248,7 +248,7 @@ protected:
     set_data_handle(data);
   }
 
-private:
+ private:
   // save the CpuMatrixPtr in case the buffer released outside
   CpuMatrixPtr m_;
 };
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index de404cad89..f48119aa51 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "MathFunctions.h"
+#include "paddle/math/MathFunctions.h"
 #include "hl_matrix_apply.cuh"
 #include "hl_matrix_ops.cuh"
 #include "paddle/utils/DynamicLoader.h"
@@ -240,6 +240,36 @@ template <>
 void vAdd<double>(const int n, const double* a, const double* b, double* r) {
   vdAdd(n, a, b, r);
 }
+
+template <>
+void vTanh<float>(const int n, const float* a, float* r) {
+  vsTanh(n, a, r);
+}
+
+template <>
+void vTanh<double>(const int n, const double* a, double* r) {
+  vdTanh(n, a, r);
+}
+
+template <>
+void vInvSqrt<float>(const int n, const float* a, float* r) {
+  vsInvSqrt(n, a, r);
+}
+
+template <>
+void vInvSqrt<double>(const int n, const double* a, double* r) {
+  vdInvSqrt(n, a, r);
+}
+
+template <>
+void vLog1p<float>(const int n, const float* a, float* r) {
+  vsLog1p(n, a, r);
+}
+
+template <>
+void vLog1p<double>(const int n, const double* a, double* r) {
+  vdLog1p(n, a, r);
+}
 #else
 
 DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
@@ -277,17 +307,6 @@ void vAdd(const int n, const T* a, const T* b, T* r) {
                                                      n);
 }
 
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);
-
-#endif
-
 DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
 template <class T>
 void vInvSqrt(const int n, const T* a, T* r) {
@@ -311,11 +330,19 @@ void vTanh(const int n, const T* a, T* r) {
       binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
+template void vExp(const int n, const float* a, float* r);
+template void vExp(const int n, const double* a, double* r);
+template void vLog(const int n, const float* a, float* r);
+template void vLog(const int n, const double* a, double* r);
+template void vPow(const int n, const float* a, const float b, float* r);
+template void vPow(const int n, const double* a, const double b, double* r);
+template void vAdd(const int n, const float* a, const float* b, float* r);
+template void vAdd(const int n, const double* a, const double* b, double* r);
 template void vInvSqrt(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const float* a, float* r);
 template void vLog1p(const int n, const float* a, float* r);
 template void vLog1p(const int n, const double* a, double* r);
 template void vTanh(const int n, const float* a, float* r);
 template void vTanh(const int n, const double* a, double* r);
-
+#endif
 }  // namespace paddle
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 04e9614eab..4c3b2c9536 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -77,7 +77,7 @@ typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
  * instead.
  */
 class Matrix : public BaseMatrix {
-protected:
+ protected:
   Matrix(MemoryHandlePtr memHandle,
          size_t height,
          size_t width,
@@ -95,11 +95,11 @@ protected:
 
   static ThreadLocal<MatrixPtr> tmpMat_;
 
-public:
+ public:
   size_t elementCnt_;  // maximal number of elements which can be held in data_
   MemoryHandlePtr memoryHandle_;
 
-public:
+ public:
   virtual ~Matrix() {}
 
   static MatrixPtr create(MemoryHandlePtr memHandle,
@@ -412,7 +412,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-public:
+ public:
   /// Only set all variables to 0 or NULL but not free them.
   virtual void clear() {
     height_ = 0;
@@ -1228,7 +1228,7 @@ inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
 }
 
 class GpuMatrix : public Matrix {
-public:
+ public:
   GpuMatrix();
 
   GpuMatrix(size_t height, size_t width, bool trans = false);
@@ -1660,11 +1660,11 @@ public:
 };
 
 class CpuMatrix : public Matrix {
-private:
+ private:
   MatrixPtr sftmaxSum_;
   MatrixPtr sftmaxDot_;
 
-public:
+ public:
   CpuMatrix(size_t height, size_t width, bool trans = false);
   CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
       : Matrix(data, height, width, trans, false) {}
@@ -1892,7 +1892,7 @@ public:
   real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
   virtual real* getRowBuf(size_t row) { return getRow(row); }
 
-public:
+ public:
   /// add b to each sample of this.
   void addBias(Matrix& b, real scale);
   void addSharedBias(Matrix& b, real scale);
@@ -2128,7 +2128,7 @@ public:
 };
 
 class SharedCpuMatrix : public CpuMatrix {
-public:
+ public:
 #ifndef PADDLE_MOBILE_INFERENCE
   /* blockNum is number of partitions of the matrix  */
   SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
@@ -2160,12 +2160,12 @@ public:
 
   ~SharedCpuMatrix() {}
 
-public:
+ public:
   virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
   virtual void add(Matrix& b, real p1, real p2);
   virtual void add(real p1, real p2);
 
-private:
+ private:
   using Matrix::mul;
   void initShared(int blockNum);
   void initBlock(int blockNum);
diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp
index 61a9923bc2..f7a949294b 100644
--- a/paddle/math/MatrixBitCode.cpp
+++ b/paddle/math/MatrixBitCode.cpp
@@ -27,7 +27,7 @@ struct SimpleCode {
   inline bool calcBit(int bit) const { return c_ & (1 << bit); }
   inline int getLength() const { return findLastSet(c_) - 1; }
 
-private:
+ private:
   size_t c_;
 };
 
@@ -39,7 +39,7 @@ struct SimpleCodeTable {
   size_t size() const { return numClasses_; }
   int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); }
 
-private:
+ private:
   size_t numClasses_;
   int maxCodeLength_;
 };
diff --git a/paddle/math/MemoryHandle.h b/paddle/math/MemoryHandle.h
index 03ee413c12..516e09dbed 100644
--- a/paddle/math/MemoryHandle.h
+++ b/paddle/math/MemoryHandle.h
@@ -20,16 +20,16 @@ limitations under the License. */
 namespace paddle {
 
 class MemoryHandle {
-protected:
+ protected:
   explicit MemoryHandle(size_t size);
   virtual ~MemoryHandle() {}
 
-public:
+ public:
   void* getBuf() const { return buf_; }
   size_t getSize() const { return size_; }
   size_t getAllocSize() const { return allocSize_; }
 
-protected:
+ protected:
   PoolAllocator* allocator_;
   size_t size_;       // the requested size
   size_t allocSize_;  // the allocated size
@@ -43,7 +43,7 @@ protected:
  * The raw handle will be released at destructor
  */
 class GpuMemoryHandle : public MemoryHandle {
-public:
+ public:
   explicit GpuMemoryHandle(size_t size);
   virtual ~GpuMemoryHandle();
 };
@@ -54,7 +54,7 @@ public:
  * The raw handle will be released at destructor
  */
 class CpuMemoryHandle : public MemoryHandle {
-public:
+ public:
   explicit CpuMemoryHandle(size_t size);
   virtual ~CpuMemoryHandle();
 };
diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h
index 90141fef3f..7239cf1c44 100644
--- a/paddle/math/PoolAllocator.h
+++ b/paddle/math/PoolAllocator.h
@@ -27,7 +27,7 @@ namespace paddle {
  * @brief Memory pool allocator implementation.
  */
 class PoolAllocator {
-public:
+ public:
   /**
    * @brief constructor.
    * @param allocator a Allocator object.
@@ -47,7 +47,7 @@ public:
   void free(void* ptr, size_t size);
   std::string getName() { return name_; }
 
-private:
+ private:
   void freeAll();
   void printAll();
   std::unique_ptr<Allocator> allocator_;
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index 2e4d11a86b..6950afaa21 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * If not set memory handler, then the data could be auto growth.
  */
 class RowBuffer {
-public:
+ public:
   /**
    * @brief RowBuffer create a auto-growth row buffer. The row length is width.
    * @param width the length of each row, a.k.a matrix width.
@@ -129,7 +129,7 @@ public:
    */
   inline size_t getWidth() const { return width_; }
 
-private:
+ private:
   //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
   //! of std::vector here.
   CpuMemHandlePtr preallocatedBuf_;
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 7c525f4edf..9181fa2923 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -25,7 +25,7 @@ namespace paddle {
 typedef std::shared_ptr<_hl_sparse_matrix_s> hl_sparse_matrix_s_ptr;
 
 class GpuSparseMatrix : public Matrix {
-public:
+ public:
   MemoryHandlePtr sMemoryHandle_;
   int* rows_;
   int* cols_;
@@ -36,7 +36,7 @@ public:
   SparseValueType valueType_;
   SparseFormat format_;
 
-public:
+ public:
   GpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
@@ -73,7 +73,7 @@ public:
                   bool trans,
                   MemoryHandlePtr sMemoryHandle);
 
-protected:
+ protected:
   struct Element {
     int row;
     int col;
@@ -82,7 +82,7 @@ protected:
         : row(rowIn), col(colIn), val(valIn) {}
   };
 
-public:
+ public:
   ~GpuSparseMatrix() {}
 
   void resize(size_t newHeight,
@@ -211,13 +211,13 @@ public:
    */
   void rowMax(IVector& maxIds, Matrix& maxVal);
 
-protected:
+ protected:
   void sparseResize();
 
   void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
   void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
 
-public:
+ public:
   void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
 
   void copyFrom(CpuSparseMatrix& src, hl_stream_t stream);
@@ -228,10 +228,10 @@ public:
   void trimFromCSC(const CpuSparseMatrix& src);
 
   // BaseMatrixT interface
-public:
+ public:
   bool isSparse() const { return true; }
 
-private:
+ private:
   using Matrix::mul;
   using Matrix::copyFrom;
   using Matrix::rowMax;
@@ -248,7 +248,7 @@ private:
 namespace paddle {
 
 class GpuSparseMatrix : public Matrix {
-public:
+ public:
   GpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 3920de32df..cf6779e8b0 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -29,7 +29,7 @@ namespace paddle {
  * Sparse Row
  */
 class SparseRowCpuMatrix : public CpuMatrix {
-public:
+ public:
   struct IndexDict {
     // In the following, global id means the row id in the original matrix.
     // Local id means the row id in the local storage which only contains
@@ -53,7 +53,7 @@ public:
 
   virtual ~SparseRowCpuMatrix() {}
 
-public:
+ public:
   /**
    *  Get the row buf
    *
@@ -163,7 +163,7 @@ public:
     return indexDictHandle_->localIndices;
   }
 
-protected:
+ protected:
   template <typename Func>
   void apply(Func f) {
     f(buf_->data(), localIndices_->size() * width_);
@@ -204,7 +204,7 @@ class SyncThreadPool;
 
 /// For prefetching parameters from remote Parameter server
 class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
-public:
+ public:
   SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle,
                              size_t height,
                              size_t width,
@@ -229,13 +229,13 @@ public:
    */
   void setupIndices();
 
-protected:
+ protected:
   void addRows(const unsigned int* ids, size_t len);
   SyncThreadPool* pool_;
 };
 
 class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {
-public:
+ public:
   SparseAutoGrowRowCpuMatrix(size_t height,
                              size_t width,
                              IndexDictPtr indexDictHandle = nullptr,
@@ -258,7 +258,7 @@ public:
 };
 
 class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {
-public:
+ public:
   CacheRowCpuMatrix(size_t height,
                     size_t width,
                     IndexDictPtr indexDictHandle = nullptr,
@@ -287,7 +287,7 @@ public:
 
   virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
 
-public:
+ public:
   CpuVectorPtr sourceDataVec_;
   real* sourceData_;
 };
@@ -299,7 +299,7 @@ public:
  * ids are hashed by worker thread id.
  */
 class SparseRowIdsCpuMatrix : public CpuMatrix {
-public:
+ public:
   SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle,
                         size_t height,
                         size_t width,
@@ -310,7 +310,7 @@ public:
 
   std::vector<uint32_t>& getIds(size_t threadId) { return idsArray_[threadId]; }
 
-private:
+ private:
   std::vector<std::vector<uint32_t>> idsArray_;
 };
 
@@ -320,13 +320,13 @@ private:
 namespace paddle {
 
 class SparseRowCpuMatrix : public CpuMatrix {
-public:
+ public:
   void reserveStore() {}
   void clearIndices() {}
 };
 
 class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
-public:
+ public:
   void setupIndices() {}
   void addRows(MatrixPtr input) {}
   void addRows(IVectorPtr ids) {}
diff --git a/paddle/math/Storage.h b/paddle/math/Storage.h
index ba8f4689a1..61a9aa2a07 100644
--- a/paddle/math/Storage.h
+++ b/paddle/math/Storage.h
@@ -25,7 +25,7 @@ namespace paddle {
  * @brief Storage manager for multiple devices.
  */
 class StorageEngine {
-public:
+ public:
   /**
    * @return Storage singleton
    */
@@ -41,7 +41,7 @@ public:
    */
   PoolAllocator* getCpuAllocator();
 
-protected:
+ protected:
   StorageEngine();
   ~StorageEngine();
   RWLock lock_;
diff --git a/paddle/math/TensorApply.h b/paddle/math/TensorApply.h
index 7d79cae5a1..8b642047bf 100644
--- a/paddle/math/TensorApply.h
+++ b/paddle/math/TensorApply.h
@@ -21,7 +21,7 @@ namespace paddle {
  */
 template <typename Derived, class T>
 class TensorApply {
-public:
+ public:
   explicit INLINE TensorApply(const Derived& p)
       : data_(p.data_),
         stride_(p.stride_),
@@ -52,7 +52,7 @@ public:
  */
 template <typename Derived, class T>
 class TensorApply<const Derived, T> {
-public:
+ public:
   explicit INLINE TensorApply(const Derived& p)
       : data_(p.data_),
         stride_(p.stride_),
@@ -77,7 +77,7 @@ public:
 
 template <typename Derived, class T>
 class TensorApply<const TensorExpression<Derived, T>, T> {
-public:
+ public:
   explicit TensorApply(const TensorExpression<Derived, T>& expr)
       : expr_(expr.derived()) {}
 
@@ -97,7 +97,7 @@ public:
  */
 template <class OP, typename ArgType, class T>
 class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
       : op_(expr.op_), expr_(expr.expr_) {}
 
@@ -118,7 +118,7 @@ public:
  */
 template <class OP, typename LhsType, typename RhsType, class T>
 class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(
       const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
       : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
@@ -153,7 +153,7 @@ public:
  */
 template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
 class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(
       const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
       : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
@@ -192,7 +192,7 @@ public:
  */
 template <class OP, typename ArgType, class T>
 class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
       : op_(expr.op_), expr_(expr.expr_) {}
 
diff --git a/paddle/math/TensorAssign.h b/paddle/math/TensorAssign.h
index 113d98c16b..7d4726ddba 100644
--- a/paddle/math/TensorAssign.h
+++ b/paddle/math/TensorAssign.h
@@ -25,7 +25,7 @@ namespace paddle {
  */
 template <typename LhsType, typename RhsType, class T>
 class TensorAssignOp {
-public:
+ public:
   explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
       : lhs_(lhs), rhs_(rhs) {
 #ifndef __CUDA_ARCH__
@@ -49,7 +49,7 @@ public:
   }
   INLINE bool useGpu() const { return lhs_.useGpu(); }
 
-private:
+ private:
   TensorApply<LhsType, T> lhs_;
   TensorApply<const RhsType, T> rhs_;
 };
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
index 83229ae65d..f6da9adfca 100644
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
@@ -40,7 +40,7 @@ class TensorAssignOp;
  */
 template <typename Derived, class T>
 class TensorExpression {
-public:
+ public:
   /**
    * Element wise unary expression.
    */
@@ -355,7 +355,7 @@ public:
     return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
   }
 
-protected:
+ protected:
   const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
 
@@ -365,7 +365,7 @@ protected:
 template <class OP, typename ExprType, class T>
 class TensorUnaryOp
     : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
-public:
+ public:
   explicit TensorUnaryOp(const OP op, const ExprType& expr)
       : op_(op), expr_(expr) {}
 
@@ -379,7 +379,7 @@ public:
 template <class OP, typename LhsType, typename RhsType, class T>
 class TensorBinaryOp
     : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
-public:
+ public:
   explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
       : op_(op), lhs_(lhs), rhs_(rhs) {}
 
@@ -395,7 +395,7 @@ template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
 class TensorTernaryOp : public TensorExpression<
                             TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
                             T> {
-public:
+ public:
   explicit TensorTernaryOp(const ExprType1& expr1,
                            const ExprType2& expr2,
                            const ExprType3& expr3)
@@ -412,7 +412,7 @@ public:
 template <class OP, typename ExprType, class T>
 class TensorConstant
     : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
-public:
+ public:
   explicit TensorConstant(const OP op, const ExprType& expr)
       : op_(op), expr_(expr) {}
 
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 3efbc769df..964b42cae5 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -40,13 +40,13 @@ class Matrix;
 
 template <class T>
 class BaseVector : public BaseMatrixT<T> {
-public:
+ public:
   BaseVector(size_t size, T* data, bool useGpu)
       : BaseMatrixT<T>(1, size, data, false, useGpu), size_(this->width_) {}
 
   ~BaseVector() {}
 
-protected:
+ protected:
   size_t& size_;
 };
 
@@ -57,7 +57,7 @@ protected:
  */
 template <class T>
 class VectorT : public BaseVector<T> {
-protected:
+ protected:
   VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
       : BaseVector<T>(size,
                       reinterpret_cast<T*>(memoryHandle->getBuf()) + offset,
@@ -71,7 +71,7 @@ protected:
   VectorT(size_t size, T* data, bool useGpu)
       : BaseVector<T>(size, data, useGpu) {}
 
-public:
+ public:
   virtual ~VectorT() {}
 
   static std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu);
@@ -281,7 +281,7 @@ public:
     }
   }
 
-protected:
+ protected:
   friend class GpuVectorT<T>;
   friend class CpuVectorT<T>;
   virtual void copyTo(CpuVectorT<T>* dest) const = 0;
@@ -297,7 +297,7 @@ std::ostream& operator<<(std::ostream& os, const VectorT<T>& vec) {
 
 template <class T>
 class GpuVectorT : public VectorT<T> {
-public:
+ public:
   explicit GpuVectorT(size_t size);
   GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
       : VectorT<T>(size, memHandle, offset, true) {}
@@ -343,14 +343,14 @@ public:
     TensorGpuApply<T>(*this, expr);
   }
 
-protected:
+ protected:
   virtual void copyTo(CpuVectorT<T>* dest) const;
   virtual void copyTo(GpuVectorT<T>* dest) const;
 };
 
 template <class T>
 class CpuVectorT : public VectorT<T> {
-public:
+ public:
   explicit CpuVectorT(size_t size);
   CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
       : VectorT<T>(size, memoryHandle, offset, false) {}
@@ -415,7 +415,7 @@ public:
 
 template <class T>
 class ParallelCpuVectorT : public CpuVectorT<T> {
-public:
+ public:
   ParallelCpuVectorT(size_t size, SyncThreadPool* pool)
       : CpuVectorT<T>(size), pool_(pool) {}
 
@@ -434,7 +434,7 @@ public:
 
   virtual void exec(SyncThreadPool::JobFunc jobFunc);
 
-private:
+ private:
   typedef std::function<void(CpuVectorT<T>& vec)> ExecFunc;
   void parallelExec(ExecFunc func);
   SyncThreadPool* pool_;
@@ -445,7 +445,7 @@ private:
  */
 template <class T>
 class CpuGpuVectorT {
-public:
+ public:
   /**
    * @brief An enum type of SyncedFlag using to
    *        mark data memory is in CPU or GPU.
@@ -670,7 +670,7 @@ public:
     setSync(flag);
   }
 
-protected:
+ protected:
   void resizeOrCreate(size_t size, bool useGpu);
 
   /**
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index f4332ede36..40ac04ef5d 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -32,7 +32,7 @@ using paddle::CpuVectorT;
 using paddle::GpuVectorT;
 
 class AssertEqual {
-public:
+ public:
   AssertEqual(real err = 0) : err_(err) {}
 
   inline bool operator()(real a, real b) {
@@ -51,7 +51,7 @@ public:
     return true;
   }
 
-private:
+ private:
   real err_;
 };
 
@@ -60,71 +60,71 @@ class CopyToCpu;
 
 template <>
 class CopyToCpu<CpuMatrix> {
-public:
+ public:
   explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
   const CpuMatrix& copiedArg() const { return arg_; }
 
-private:
+ private:
   const CpuMatrix& arg_;
 };
 
 template <>
 class CopyToCpu<GpuMatrix> {
-public:
+ public:
   explicit CopyToCpu(const GpuMatrix& arg)
       : arg_(arg.getHeight(), arg.getWidth()) {
     arg_.copyFrom(arg);
   }
   CpuMatrix& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuMatrix arg_;
 };
 
 template <>
 class CopyToCpu<Matrix> {
-public:
+ public:
   explicit CopyToCpu(const Matrix& arg)
       : arg_(arg.getHeight(), arg.getWidth()) {
     arg_.copyFrom(arg);
   }
   CpuMatrix& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuMatrix arg_;
 };
 
 template <typename T>
 class CopyToCpu<CpuVectorT<T>> {
-public:
+ public:
   explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
   const CpuVectorT<T>& copiedArg() const { return arg_; }
 
-private:
+ private:
   const CpuVectorT<T>& arg_;
 };
 
 template <typename T>
 class CopyToCpu<GpuVectorT<T>> {
-public:
+ public:
   explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
     arg_.copyFrom(arg);
   }
   CpuVectorT<T>& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuVectorT<T> arg_;
 };
 
 template <typename T>
 class CopyToCpu<VectorT<T>> {
-public:
+ public:
   explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
     arg_.copyFrom(arg);
   }
   CpuVectorT<T>& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuVectorT<T> arg_;
 };
 
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
index d2b9706432..e1966ec8a7 100644
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
@@ -56,31 +56,31 @@ using paddle::GpuSparseMatrix;
 
 template <typename T1, typename T2>
 class ReplaceType {
-public:
+ public:
   typedef T1 type;
 };
 
 template <>
 class ReplaceType<BaseMatrix, CpuMatrix> {
-public:
+ public:
   typedef CpuMatrix type;
 };
 
 template <>
 class ReplaceType<BaseMatrix, GpuMatrix> {
-public:
+ public:
   typedef GpuMatrix type;
 };
 
 template <>
 class ReplaceType<Matrix, CpuMatrix> {
-public:
+ public:
   typedef CpuMatrix type;
 };
 
 template <>
 class ReplaceType<Matrix, GpuMatrix> {
-public:
+ public:
   typedef GpuMatrix type;
 };
 
@@ -180,25 +180,25 @@ R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
 
 template <typename T>
 class ReturnType {
-public:
+ public:
   typedef T type;
 };
 
 template <>
 class ReturnType<CpuMatrix> {
-public:
+ public:
   typedef GpuMatrix type;
 };
 
 template <>
 class ReturnType<CpuIVector> {
-public:
+ public:
   typedef GpuIVector type;
 };
 
 template <>
 class ReturnType<CpuSparseMatrix> {
-public:
+ public:
   typedef GpuSparseMatrix type;
 };
 
@@ -234,7 +234,7 @@ GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
 }
 
 class AutoCompare {
-public:
+ public:
   /**
    * err is the allowed calculation error.
    * The smaller the value of err,
@@ -285,7 +285,7 @@ public:
     TensorCheck(compare, cpu, gpu);
   }
 
-protected:
+ protected:
   CpuMatrix cpu;
   GpuMatrix gpu;
   AssertEqual compare;
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 513c7b440e..72256cb9d4 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -39,7 +39,7 @@ real f(Matrix& mat1,
 }
 
 class Functor {
-public:
+ public:
   real operator()(Matrix& mat1,
                   const Matrix& mat2,
                   IVector& vec1,
@@ -49,7 +49,7 @@ public:
     return a_;
   }
 
-private:
+ private:
   real a_;
 };
 
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index fb146176ca..fb58d26734 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -28,14 +28,14 @@ DEFINE_double(max_diff, 1e-13, "max diff allowed");
 #endif
 
 class SetMaxDiff {
-public:
+ public:
   explicit SetMaxDiff(double max_diff) {
     max_diff_ = FLAGS_max_diff;
     FLAGS_max_diff = max_diff;
   }
   ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
 
-private:
+ private:
   double max_diff_;
 };
 
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index ef99dab60a..969400666f 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -32,7 +32,7 @@ const int TGT_SIZE = 21;
 const int CHANNELS = 3;
 
 class PerturbationTest : public testing::Test {
-protected:
+ protected:
   virtual void SetUp() { generateTestImages(gpuImages_); }
 
   virtual void TearDown() {}
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index 25fc35311f..7c80faa48c 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -7,6 +7,10 @@ set(OPITMIZER_SRCS
     sgd_optimizer.cc
   )
 
-cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
-cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
-cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
+add_library(paddle_optimizer ${OPITMIZER_SRCS})
+target_link_libraries(paddle_optimizer paddle_proto glog)
+
+if (WITH_TESTING)
+    add_unittest(serialization_test serialization_test.cc)
+    add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
+endif()
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 74df9d54be..5beb62295a 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class AdadeltaOptimizer : public ParameterOptimizer {
-public:
+ public:
   AdadeltaOptimizer(
       Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
       : ParameterOptimizer(parameter, lr),
@@ -40,7 +40,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string &state);
 
-private:
+ private:
   Tensor *accum_gradient_;
   Tensor *accum_delta_;
   Tensor *update_delta_;
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index 1d58402d78..b6fc067399 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class AdagradOptimizer : public ParameterOptimizer {
-public:
+ public:
   AdagradOptimizer(Tensor *parameter,
                    LrPolicy *lr,
                    double epsilon,
@@ -36,7 +36,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string &state);
 
-private:
+ private:
   Tensor *accum_gradient_;
   double epsilon_;
   double decay_;
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index 7977226c86..fce1096006 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class AdamOptimizer : public ParameterOptimizer {
-public:
+ public:
   AdamOptimizer(Tensor *parameter,
                 LrPolicy *lr,
                 double beta_1,
@@ -42,7 +42,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string &state);
 
-private:
+ private:
   Tensor *momentums_;
   Tensor *velocitys_;
   double beta_1_;
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index 14422d1f42..d639c9f22c 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class LrPolicy {
-public:
+ public:
   virtual ~LrPolicy() {}
   virtual double LearningRate(const uint64_t num_sample_passed) = 0;
   virtual std::string SerializeState() = 0;
@@ -29,7 +29,7 @@ public:
 
 // constant learning rate policy
 class ConstLr final : public LrPolicy {
-public:
+ public:
   ConstLr(double lr) : learning_rate_(lr){};
   double LearningRate(const uint64_t num_sample_passed) {
     return learning_rate_;
@@ -45,12 +45,12 @@ public:
     learning_rate_ = state.learning_rate();
   }
 
-private:
+ private:
   double learning_rate_;
 };
 
 class LinearLr final : public LrPolicy {
-public:
+ public:
   LinearLr(double lr, double lr_decay_a, double lr_decay_b)
       : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
   double LearningRate(const uint64_t num_sample_passed) {
@@ -72,7 +72,7 @@ public:
     lr_decay_b_ = state.lr_decay_b();
   }
 
-private:
+ private:
   double learning_rate_;
   double lr_decay_a_;
   double lr_decay_b_;
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index c7cf8db3ee..d5abca82d5 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -26,7 +26,7 @@ namespace paddle {
 namespace optimizer {
 
 class ParameterOptimizer {
-public:
+ public:
   /**
    * @brief  update hook for algorithm need to traverse parameter more than
    * once.
@@ -45,7 +45,7 @@ public:
   virtual std::string SerializeState() = 0;
   virtual void DeserializeState(const std::string &state) = 0;
 
-protected:
+ protected:
   Tensor *parameter_;
   // learning rate policy
   LrPolicy *lr_policy_;
diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc
index d663e2fd00..1d9572999e 100644
--- a/paddle/optimizer/parameter_optimizer_test.cc
+++ b/paddle/optimizer/parameter_optimizer_test.cc
@@ -38,7 +38,7 @@ paddle::optimizer::Tensor* FixedTensor(size_t size) {
 }
 
 class OptimizerTest : public testing::Test {
-public:
+ public:
   virtual ~OptimizerTest() {}
   // init paddle::optimizer::Tensor shape
   const size_t kSize = 5;
@@ -115,7 +115,7 @@ public:
     }
   }
 
-private:
+ private:
   std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
   paddle::OptimizerConfig config_;
 };
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index f504d98adb..a8957cde54 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class SGDOptimizer : public ParameterOptimizer {
-public:
+ public:
   SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
       : ParameterOptimizer(parameter, lr),
         momentums_(nullptr),
@@ -39,7 +39,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string& state);
 
-private:
+ private:
   Tensor* momentums_;
   double momentum_;
   double decay_;
diff --git a/paddle/optimizer/tensor.h b/paddle/optimizer/tensor.h
index fd32398a23..d2cef99074 100644
--- a/paddle/optimizer/tensor.h
+++ b/paddle/optimizer/tensor.h
@@ -26,7 +26,7 @@ namespace optimizer {
 
 template <class T>
 class TensorT {
-public:
+ public:
   TensorT(size_t size) : height_(1), width_(size) {
     // new T[size]() initializes all element to zero value.
     data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
@@ -54,7 +54,7 @@ public:
   // TODO: replace with tensorshape
   size_t size() const { return this->width_ * this->height_; }
 
-protected:
+ protected:
   size_t height_;
   size_t width_;
   std::shared_ptr<T> data_ptr_;
diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/parameter/AverageOptimizer.h
index 4ad3c18d56..f0fe2fd28e 100644
--- a/paddle/parameter/AverageOptimizer.h
+++ b/paddle/parameter/AverageOptimizer.h
@@ -21,7 +21,7 @@ namespace paddle {
 // After Optimization, parameter values are further averaged within
 // time range.
 class AverageOptimizer : public ParameterOptimizer {
-public:
+ public:
   // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter
   // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT
   AverageOptimizer(const OptimizationConfig& optConfig,
@@ -65,7 +65,7 @@ public:
 
   virtual void setNoDecay() { optimizer_->setNoDecay(); }
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> optimizer_;
   bool useApply_;
 
@@ -98,7 +98,7 @@ protected:
 
 // Average Optimizer with Sparse support.
 class AverageSparseOptimizer : public AverageOptimizer {
-public:
+ public:
   AverageSparseOptimizer(const OptimizationConfig& optConfig,
                          ParameterOptimizer* optimizer,
                          bool useParameterApply)
@@ -130,7 +130,7 @@ public:
     t0Vec_.assign(t0Vec_.size(), 0);
   }
 
-protected:
+ protected:
   /**
    *  counting batches, clear after catch up with
    *  t(timer_) is current time,
diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt
index d2ae1c16c6..19ae07e077 100644
--- a/paddle/parameter/CMakeLists.txt
+++ b/paddle/parameter/CMakeLists.txt
@@ -5,8 +5,6 @@ file(GLOB PARAMETERS_SOURCES . *.cpp)
 
 add_library(paddle_parameter STATIC
         ${PARAMETERS_SOURCES})
-add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
-add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
 add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index 047989fcad..86b9a591af 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -22,7 +22,7 @@ namespace paddle {
 
 // Plain SGD optimization.
 class SgdOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit SgdOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -77,7 +77,7 @@ class SparseMomentumParameterOptimizer : public ParameterOptimizer {
     \gamma_t: learning rate at the t'th step
   */
 
-public:
+ public:
   explicit SparseMomentumParameterOptimizer(
       const OptimizationConfig& optConfig);
   virtual void init(size_t numRows, const ParameterConfig* config);
@@ -89,7 +89,7 @@ public:
       const ParameterConfig& config) const;
   virtual void finishBatch();
 
-private:
+ private:
   real alpha_;
   real beta_;
   real tau_;
@@ -98,7 +98,7 @@ private:
   real momentum_;
   real decayRate_;
 
-protected:
+ protected:
   int64_t timer_;
   mutable std::vector<int64_t> t0Vec_;
   bool isParameterSparse_;
@@ -109,7 +109,7 @@ protected:
  * http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
  */
 class AdagradParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdagradParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -129,7 +129,7 @@ public:
   virtual TraverseCallback needSpecialTraversal(
       const ParameterConfig& config) const;
 
-protected:
+ protected:
   int64_t numUpdates_;
   static const int64_t kMaxNumAccumulates = 16384;
 };
@@ -139,7 +139,7 @@ protected:
  * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
  */
 class AdaDeltaParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -158,14 +158,14 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real rou_;
   real epsilon_;
 };
 
 // RMSProp Parameter Optimization.
 class RMSPropParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit RMSPropParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -191,7 +191,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real rou_;
   real epsilon_;
 
@@ -208,7 +208,7 @@ protected:
 
 // Decayed AdaGrad Optimization.
 class DecayedAdagradParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit DecayedAdagradParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -233,7 +233,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real rou_;
   real epsilon_;
 
@@ -253,7 +253,7 @@ protected:
  * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 1
  */
 class AdamParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdamParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig),
         beta1_(optConfig.adam_beta1()),
@@ -275,7 +275,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real beta1_;
   real beta2_;
   real epsilon_;
@@ -288,7 +288,7 @@ protected:
  * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 2
  */
 class AdamaxParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdamaxParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig),
         beta1_(optConfig.adam_beta1()),
@@ -305,7 +305,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real beta1_;
   real beta2_;
   int64_t step_;
@@ -315,7 +315,7 @@ protected:
 // Used in pserver,
 // when PARAMETER_DELTA stores in PARAMETER_GRADIENT.
 class AddOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AddOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {}
 
@@ -333,7 +333,7 @@ public:
 
 // A optimizer which does nothing.
 class DummyOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit DummyOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {}
 
@@ -344,7 +344,7 @@ public:
 
 // Do gradient clipping before sgd update
 class OptimizerWithGradientClipping : public ParameterOptimizer {
-public:
+ public:
   OptimizerWithGradientClipping(const OptimizationConfig& optConfig,
                                 ParameterOptimizer* optimizer)
       : ParameterOptimizer(optConfig), optimizer_(optimizer) {
@@ -374,7 +374,7 @@ public:
 
   virtual void setNoDecay() { optimizer_->setNoDecay(); }
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> optimizer_;
 };
 
diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/parameter/LearningRateScheduler.cpp
index b6b58e3dda..d57d2189a4 100644
--- a/paddle/parameter/LearningRateScheduler.cpp
+++ b/paddle/parameter/LearningRateScheduler.cpp
@@ -28,20 +28,20 @@ LearningRateScheduler* LearningRateScheduler::create(
 // LRS stands for LearningRateScheduler
 
 class BaseLRS : public LearningRateScheduler {
-public:
+ public:
   explicit BaseLRS(const OptimizationConfig& config)
       : learningRate_(config.learning_rate()),
         a_(config.learning_rate_decay_a()),
         b_(config.learning_rate_decay_b()) {}
 
-protected:
+ protected:
   real learningRate_;
   real a_;
   real b_;
 };
 
 class ConstLRS : public BaseLRS {
-public:
+ public:
   explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     return learningRate_;
@@ -50,7 +50,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS);
 
 class PolyLRS : public BaseLRS {
-public:
+ public:
   explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_);
@@ -59,7 +59,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS);
 
 class CaffePolyLRS : public BaseLRS {
-public:
+ public:
   explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     if (numSamplesProcessed > a_) {
@@ -78,7 +78,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS);
 
 class ExpLRS : public BaseLRS {
-public:
+ public:
   explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     double decayRatio = (double)numSamplesProcessed / b_;
@@ -88,7 +88,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS);
 
 class DiscreteExpLRS : public BaseLRS {
-public:
+ public:
   explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     int numDecays = floor(numSamplesProcessed / b_);
@@ -98,7 +98,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS);
 
 class LinearLRS : public BaseLRS {
-public:
+ public:
   explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     return std::max(learningRate_ - a_ * numSamplesProcessed, b_);
@@ -113,7 +113,7 @@ REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS);
   then learning_rate = learning_rate_base * rate_i
 */
 class ManualLRS : public BaseLRS {
-public:
+ public:
   explicit ManualLRS(const OptimizationConfig& config)
       : BaseLRS(config), currentSegment_(0), lastNum_(0) {
     std::vector<std::string> pieces;
@@ -151,7 +151,7 @@ public:
     return learningRate_ * rates_.back();
   }
 
-protected:
+ protected:
   std::vector<real> rates_;
   std::vector<int64_t> segments_;
   size_t currentSegment_;
@@ -161,7 +161,7 @@ protected:
 REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS);
 
 class PassManualLRS : public ManualLRS {
-public:
+ public:
   explicit PassManualLRS(const OptimizationConfig& config)
       : ManualLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/parameter/LearningRateScheduler.h
index aea99a1c20..3fad970402 100644
--- a/paddle/parameter/LearningRateScheduler.h
+++ b/paddle/parameter/LearningRateScheduler.h
@@ -26,7 +26,7 @@ namespace paddle {
   })
 
 class LearningRateScheduler {
-public:
+ public:
   static LearningRateScheduler* create(const OptimizationConfig& config);
   virtual ~LearningRateScheduler() {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0;
diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/parameter/OptimizerWithRegularizer.h
index 7219d96d92..bd29b39663 100644
--- a/paddle/parameter/OptimizerWithRegularizer.h
+++ b/paddle/parameter/OptimizerWithRegularizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 
 // add regularizer for objective function to do optimization
 class OptimizerWithRegularizer : public ParameterOptimizer {
-public:
+ public:
   static ParameterOptimizer* create(const OptimizationConfig& optConfig,
                                     const ParameterConfig& paraConfig,
                                     bool isParameterSparse,
@@ -67,7 +67,7 @@ public:
     regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1);
   }
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> optimizer_;
   Regularizer* regularizer_;
 
@@ -84,7 +84,7 @@ protected:
 // Regularized Loss function for every num of batches
 class OptimizerWithRegularizerEveryNumBatches
     : public OptimizerWithRegularizer {
-public:
+ public:
   OptimizerWithRegularizerEveryNumBatches(const OptimizationConfig& optConfig,
                                           ParameterOptimizer* optimizer,
                                           Regularizer* regularizer)
@@ -112,7 +112,7 @@ public:
   virtual TraverseCallback startCatchUpWith() const;
   virtual void finishCatchUpWith() { baseTimer_ = timer_; }
 
-protected:
+ protected:
   bool isRegularizationBatch(const ParameterConfig& config) const {
     return ((timer_ + 1) % config.num_batches_regularization() == 0);
   }
@@ -125,7 +125,7 @@ protected:
 
 // Regularized Loss function with Sparse support
 class OptimizerWithRegularizerSparse : public OptimizerWithRegularizer {
-public:
+ public:
   OptimizerWithRegularizerSparse(const OptimizationConfig& optConfig,
                                  ParameterOptimizer* optimizer,
                                  Regularizer* regularizer)
@@ -145,7 +145,7 @@ public:
     t0Vec_.assign(t0Vec_.size(), 0);
   }
 
-protected:
+ protected:
   /**
    *  t0Vec_ are last occur time of i rows
    *  if one block is update by multi threads,
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 24ac10f3fe..ef519bf35a 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -58,7 +58,7 @@ class Parameter;
 typedef std::shared_ptr<Parameter> ParameterPtr;
 
 class Parameter {
-public:
+ public:
   Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true);
   const std::string& getName() const { return config_.name(); }
 
@@ -311,7 +311,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief create matrix to matType.
    *
@@ -326,7 +326,7 @@ protected:
 
   void clearUpdate() { updateCounter_ = 0; }
 
-protected:
+ protected:
   ParameterConfig config_;
 
   bool useGpu_;
@@ -363,7 +363,7 @@ protected:
 
   std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
 
-public:
+ public:
   void setSharedCount(int cnt) { sharedCount_ = cnt; }
   int getSharedCount() { return sharedCount_; }
 
diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h
index a8d0ca72f2..019afa1358 100644
--- a/paddle/parameter/ParameterOptimizer.h
+++ b/paddle/parameter/ParameterOptimizer.h
@@ -30,12 +30,12 @@ namespace paddle {
  *    may be called many times, should be no state change between calls.
  */
 class ParameterOptimizer {
-public:
+ public:
   typedef std::function<void(
       const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId)>
       TraverseCallback;
 
-public:
+ public:
   explicit ParameterOptimizer(const OptimizationConfig& optConfig)
       : applyDecay_(true),
         optConfig_(optConfig),
@@ -175,7 +175,7 @@ public:
   static ParameterOptimizer* create(const OptimizationConfig& optConfig,
                                     bool inPserver = false);
 
-protected:
+ protected:
   typedef std::vector<ParameterOptimizer::TraverseCallback> TraverseCallbackVec;
 
   static TraverseCallback composeCallbacks(
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index 717e1c6721..493512886c 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -21,7 +21,7 @@ namespace paddle {
 class ParameterOptimizer;
 
 class ParameterUpdater {
-public:
+ public:
   ParameterUpdater() : parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT} {}
   virtual ~ParameterUpdater() {}
 
@@ -89,7 +89,7 @@ public:
   virtual void setForwardbackwardTime(uint64_t delta) {}
 #endif
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para) = 0;
 
   std::vector<ParameterType> parameterTypes_;
@@ -101,7 +101,7 @@ protected:
 // part of all Parameters. It's useful when we need different
 // update strategy for different Parameter.
 class ParameterUpdaterComposite : public ParameterUpdater {
-public:
+ public:
   ParameterUpdaterComposite() {}
   virtual ~ParameterUpdaterComposite() {}
 
@@ -173,7 +173,7 @@ public:
         [&](int tid, size_t numThreads) { updaters_[tid]->restore(); });
   }
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para) {}
   std::vector<std::unique_ptr<ParameterUpdater>> updaters_;
   std::unique_ptr<SyncThreadPool> syncThreadPool_;
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index e6aec3c348..989185b66a 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class StaticPruningHook : public IParameterUpdaterHook {
-public:
+ public:
   explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
       : initCount_(0) {
     sparsityRatio_ = hookConfig.sparsity_ratio();
@@ -96,7 +96,7 @@ public:
     paraVec->dotMul(*maskVec_);
   }
 
-private:
+ private:
   SameThreadChecker updateThreadChecker_;
   std::atomic<size_t> initCount_;
   VectorPtr maskVec_;
@@ -116,12 +116,12 @@ IParameterUpdaterHook::~IParameterUpdaterHook() {}
  * May be extracted to Util.h to unify the hasher.
  */
 class StringIntPairHasher {
-public:
+ public:
   size_t operator()(const std::pair<std::string, int> &k) const {
     return intHasher_(strHasher_(k.first) + k.second);
   }
 
-private:
+ private:
   std::hash<std::string> strHasher_;
   std::hash<int> intHasher_;
 };
diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/parameter/ParameterUpdaterHook.h
index d30530ec39..cb96e4cf00 100644
--- a/paddle/parameter/ParameterUpdaterHook.h
+++ b/paddle/parameter/ParameterUpdaterHook.h
@@ -29,7 +29,7 @@ class Parameter;
  * parameter optimization.
  */
 class IParameterUpdaterHook {
-public:
+ public:
   virtual ~IParameterUpdaterHook();
 
   /**
@@ -53,7 +53,7 @@ public:
    */
   virtual void init(Parameter* para) = 0;
 
-protected:
+ protected:
   /**
    * Ctor.
    */
diff --git a/paddle/parameter/Regularizer.h b/paddle/parameter/Regularizer.h
index 6bed7b0ddf..fa5384e232 100644
--- a/paddle/parameter/Regularizer.h
+++ b/paddle/parameter/Regularizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 
 // Regularizer function for parameter, e.g. L1/L2
 class Regularizer {
-public:
+ public:
   virtual void update(const VectorPtr vecs[],
                       const ParameterConfig& paraConfig,
                       real learningRate,  // learningrate from optimizer
diff --git a/paddle/parameter/Weight.h b/paddle/parameter/Weight.h
index 7314c29d0d..113dd6530c 100644
--- a/paddle/parameter/Weight.h
+++ b/paddle/parameter/Weight.h
@@ -23,12 +23,12 @@ limitations under the License. */
 namespace paddle {
 
 class Weight {
-private:
+ private:
   MatrixPtr weight_;
   MatrixPtr weightGrad_;
   ParameterPtr parameter_;
 
-public:
+ public:
   Weight(size_t height, size_t width, ParameterPtr parameter);
   Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset);
 
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index 6e10becabb..89dcc6c751 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 
 class CommonTest : public ::testing::Test {
-protected:
+ protected:
   CommonTest() : testStat_("test") {}
   virtual ~CommonTest() {}
   virtual void SetUp() {
@@ -51,7 +51,7 @@ protected:
 
   virtual void TreaDown() { LOG(INFO) << "All Test Finished."; }
 
-protected:
+ protected:
   std::vector<std::pair<real, real>> valueUint_;
   std::vector<size_t> sizeVec_;
   real learningRate_;
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index a932d34712..d50230e73a 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -32,7 +32,7 @@ namespace paddle {
  * connections.
  */
 class BaseClient {
-protected:
+ protected:
   typedef std::unique_ptr<std::thread> ThreadPtr;
   typedef std::vector<std::vector<iovec>> InputIovs;
   typedef std::vector<SendParameterRequest> SendRequest;
@@ -49,7 +49,7 @@ protected:
     SendDataRequestVec parallelDataRequests;
   };
 
-public:
+ public:
   explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num);
 
   virtual ~BaseClient();
@@ -141,7 +141,7 @@ public:
     return dataType;
   }
 
-protected:
+ protected:
   /// for a > 0, b > 0:
   /// return the smallest x s.t. b*x >= a
   static int divup(int a, int b) { return (a + b - 1) / b; }
@@ -264,7 +264,7 @@ protected:
    */
   virtual void recv(int threadId) = 0;
 
-protected:
+ protected:
   bool stopping_;
   /// nodes * ports that means the number of real pservers
   int serviceNum_;
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index f75475a88f..0ae9c6ef6a 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -14,9 +14,6 @@ set(NETWORK_HEADERS
 add_library(paddle_network STATIC
     ${NETWORK_SOURCES})
 
-add_style_check_target(paddle_network ${NETWORK_SOURCES})
-add_style_check_target(paddle_network ${NETWORK_HEADERS})
-
 add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
 
 ################### paddle_pserver ######################
@@ -37,9 +34,6 @@ set(PSERVER_HEADERS
 add_library(paddle_pserver STATIC
     ${PSERVER_SOURCES})
 
-add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
-add_style_check_target(paddle_pserver ${PSERVER_HEADERS})
-
 add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
 
 set(PSERVER_MAIN_SOURCES
diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h
index 2aaa26a5c7..bcfc9655e9 100644
--- a/paddle/pserver/LightNetwork.h
+++ b/paddle/pserver/LightNetwork.h
@@ -41,7 +41,7 @@ class SocketServer : public Thread {
   // rdmaCpu controls the cpu affinity of RDMA server daemon,
   // which could benifit performance. rdmaCpu = -1 means TCP
   // is used instead of RDMA transport.
-public:
+ public:
   SocketServer(const std::string& addr, int port, int rdmaCpu);
   ~SocketServer();
 
@@ -50,7 +50,7 @@ public:
   typedef std::function<void(const std::vector<iovec>& outputIovs)>
       ResponseCallback;
 
-protected:
+ protected:
   //
   // The derived class needs to implement this function
   // to handle the request received by SocketWorker
@@ -70,13 +70,13 @@ protected:
 
   friend class SocketWorker;
 
-private:
+ private:
   void rdmaServer();
   void tcpServer();
 
   void detach() {}  // detach accept thread is forbidden
 
-protected:
+ protected:
   enum ChannelType tcpRdma_;
   // for rdma
   int rdmaCpu_;
@@ -96,7 +96,7 @@ protected:
  * @note  all parameter processing will run in the context of this worker
  */
 class SocketWorker : public Thread {
-public:
+ public:
   SocketWorker(std::unique_ptr<SocketChannel>&& channel, SocketServer* server)
       : channel_(std::move(channel)), server_(server) {}
 
@@ -104,7 +104,7 @@ public:
 
   virtual void run();
 
-protected:
+ protected:
   std::unique_ptr<SocketChannel> channel_;
   SocketServer* server_;
   enum ChannelType tcpRdma_;
@@ -118,12 +118,12 @@ protected:
  *        single cpu core for better load balance performance
  */
 class RdmaClientDaemons {
-private:
+ private:
   RdmaClientDaemons();
 
   static std::unique_ptr<RdmaClientDaemons> daemons_;
 
-public:
+ public:
   static RdmaClientDaemons* get() {
     std::call_once(RdmaClientDaemons::initDataFlag_,
                    &RdmaClientDaemons::getInstance);
@@ -141,10 +141,10 @@ public:
 
   ~RdmaClientDaemons();
 
-public:
+ public:
   friend class SocketClient;
 
-private:
+ private:
   static std::once_flag initDataFlag_;
   static void getInstance() {
     if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons());
@@ -162,19 +162,19 @@ private:
  *        read data
  */
 class SocketClient {
-public:
+ public:
   SocketClient(const std::string& serverAddr,
                int serverPort,
                enum ChannelType channelType);
 
   SocketChannel* getChannel() { return channel_.get(); }
 
-protected:
+ protected:
   std::unique_ptr<SocketChannel> channel_;
   struct sxi_socket* socketDaemon_;
   enum ChannelType tcpRdma_;
 
-private:
+ private:
   void RdmaClient(const std::string& serverAddr, int serverPort);
   void TcpClient(const std::string& serverAddr, int serverPort);
 };
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index d63273ccbc..c96bb78715 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -50,11 +50,11 @@ struct PServerVector {
  * @brief A class to help to prepare server-side operations.
  */
 class PreparedOperations {
-protected:
+ protected:
   class ResultsAdder;
   struct LocalOperationResult;
 
-public:
+ public:
   /**
    * Offers an easy way to prepare operations that will be performed on
    * server-side.
@@ -93,7 +93,7 @@ public:
     return ResultsAdder(&localResults_.back());
   }
 
-protected:
+ protected:
   void addOperationHelper(Operation* op) {}
 
   /**
@@ -151,7 +151,7 @@ protected:
    * @brief ResultsAdder offers easy ways to quickly store operation results.
    */
   class ResultsAdder {
-  public:
+   public:
     explicit ResultsAdder(LocalOperationResult* localResult)
         : localResult_(localResult) {}
     template <typename... Args>
@@ -172,11 +172,11 @@ protected:
       addResult(args...);
     }
 
-  protected:
+   protected:
     LocalOperationResult* localResult_;
   };
 
-protected:
+ protected:
   DoOperationRequest request_;
   std::vector<iovec> inputIovs_;
   struct LocalOperationResult {
@@ -214,7 +214,7 @@ struct ParameterSegments {
  * waiting until all parameters are received to CPU host end.
  */
 class ParameterClient2 : public BaseClient {
-public:
+ public:
   /** Constructor.
    * @param separate True if sending and recieving activities are separated
    *                 into 2 threads, otherwise false.
@@ -232,7 +232,7 @@ public:
   static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
                                     size_t serviceNum);
 
-public:
+ public:
   bool init(const std::vector<ParameterPtr>& parameters);
 
   /// service functions
@@ -514,7 +514,7 @@ public:
   void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; }
 #endif
 
-protected:
+ protected:
   template <typename ProtoIn, typename ProtoOut>
   void multiCall(const char* funcName,
                  const ProtoIn& request,
@@ -529,7 +529,7 @@ protected:
     }
   }
 
-private:
+ private:
   void destroy();
 
   /**
@@ -573,7 +573,7 @@ private:
   /// start necessary threads for threadPool
   void initThreads();
 
-protected:
+ protected:
   /// start port number of pserver
   /// it deduce all ports for dense and sparse with some rules
   int port_;
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index 3ed06b6b04..0b8ef5c170 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -71,7 +71,7 @@ namespace paddle {
  * to prevent from being polluted.
  */
 class ParameterServer2 : public ProtoServer {
-protected:
+ protected:
   /// parameter_ mutex.
   RWLock parameterMutex_;
 
@@ -169,7 +169,7 @@ protected:
   template <typename T, size_t AlignBytes>
   class ReadWriteBuffer
       : public std::vector<T, AlignedAllocator<T, AlignBytes>> {
-  public:
+   public:
     static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0,
                   "Type T must be able to aligned.");
 
@@ -229,7 +229,7 @@ protected:
       return r;
     }
 
-  private:
+   private:
     size_t curOffset_;
   };
 
@@ -298,17 +298,17 @@ protected:
   /// barrier performance tuning sync-sgd required
   std::atomic<int64_t> batchId_;
 
-public:
+ public:
   struct Buffer {
     real* base;
     size_t size;
   };
 
-protected:
+ protected:
   /// async gradient commit control
   bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
 
-public:
+ public:
   /// disable default parameter for overloading
   /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N)
   /// -1 means using TCP transport instead of RDMA
@@ -437,7 +437,7 @@ public:
   void saveValueVector(const SaveValueRequest& request,
                        ProtoResponseCallback callback);
 
-public:
+ public:
   /**
    * @brief initialize parameter server
    */
@@ -512,7 +512,7 @@ public:
                           SendParameterResponse* response,
                           std::vector<Buffer>* outputBuffers);
 
-protected:
+ protected:
   void mergeSegments(BlockSegments* segments);
 
   /// set the unused segments to zero
@@ -641,7 +641,7 @@ protected:
                      const VectorPtr vecs[],
                      const ParameterOptimizer::TraverseCallback& callback);
 
-public:
+ public:
   typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation,
                                                      OperationResult* result);
 
diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h
index 3a9bc74edf..1308d62fb1 100644
--- a/paddle/pserver/ParameterServerController.h
+++ b/paddle/pserver/ParameterServerController.h
@@ -28,7 +28,7 @@ namespace paddle {
  * by gflags or proto.
  */
 class ParameterServerController final {
-public:
+ public:
   DISABLE_COPY(ParameterServerController);
 
   /**
@@ -67,7 +67,7 @@ public:
    */
   void wait();
 
-private:
+ private:
   std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
 };
 
diff --git a/paddle/pserver/ProtoServer.h b/paddle/pserver/ProtoServer.h
index 3f78799dbf..2943867de5 100644
--- a/paddle/pserver/ProtoServer.h
+++ b/paddle/pserver/ProtoServer.h
@@ -34,7 +34,7 @@ namespace paddle {
  * for single NIC hardward with --port=N(N>1) for small cluster job.
  */
 class ProtoServer : public SocketServer {
-public:
+ public:
   /// rdmaCpu controls the cpu affinity of RDMA server daemon,
   /// which could benifit performance. rdmaCpu = -1 means TCP
   /// is used instead of RDMA transport.
@@ -87,7 +87,7 @@ public:
                          std::unique_ptr<MsgReader> msgReader,
                          ProtoResponseCallbackEx callback)> func);
 
-protected:
+ protected:
   /**
    * @brief handle rpc request
    * @param[in] msgReader  Message reader for reading data from connection
@@ -111,7 +111,7 @@ protected:
   void registerServiceFunctionImp(const std::string& funcName,
                                   ServiceFunction func);
 
-protected:
+ protected:
   /// Tuning bare network overhead: the beginning of receiving request
   ThreadLocal<struct timeval> handleRequestBegin_;
 
@@ -120,7 +120,7 @@ protected:
 };
 
 class ProtoClient : public SocketClient {
-public:
+ public:
   ProtoClient(const std::string& serverAddr,
               int serverPort,
               enum ChannelType channelType = F_TCP)
diff --git a/paddle/pserver/SocketChannel.h b/paddle/pserver/SocketChannel.h
index c0f30d0db7..8b45ac5609 100644
--- a/paddle/pserver/SocketChannel.h
+++ b/paddle/pserver/SocketChannel.h
@@ -33,7 +33,7 @@ enum ChannelType {
 
 /// reading a set of blocks of data from SocketChannel.
 class MsgReader {
-public:
+ public:
   MsgReader(SocketChannel* channel, size_t numIovs);
   ~MsgReader() {
     /// ensure all data blocks have been processed
@@ -75,7 +75,7 @@ public:
   void readBlocks(const std::vector<void*>& bufs);
   void readNextBlock(void* buf);
 
-protected:
+ protected:
   SocketChannel* channel_;
   std::vector<size_t> blockLengths_;
   size_t currentBlockIndex_;
@@ -84,7 +84,7 @@ protected:
 /// APIs for reading and writing byte stream data or naive iov data
 /// from the APIs both RDMA and TCP exhibits byte stream style
 class SocketChannel {
-public:
+ public:
   SocketChannel(int socket, const std::string& peerName)
       : tcpSocket_(socket), peerName_(peerName) {
     tcpRdma_ = F_TCP;
@@ -137,7 +137,7 @@ public:
   /// return null to indicate socket is closed
   std::unique_ptr<MsgReader> readMessage();
 
-protected:
+ protected:
   struct MessageHeader {
     int64_t totalLength;  /// include the header
     int64_t numIovs;
diff --git a/paddle/pserver/SparseParameterDistribution.h b/paddle/pserver/SparseParameterDistribution.h
index 13f199548d..e168f36c75 100644
--- a/paddle/pserver/SparseParameterDistribution.h
+++ b/paddle/pserver/SparseParameterDistribution.h
@@ -31,7 +31,7 @@ namespace paddle {
  * if unbalanced distribution exhibts by default.
  */
 class SparseParameterDistribution {
-public:
+ public:
   /// serviceNum means the number of ParameterServers
   explicit SparseParameterDistribution(size_t serviceNum);
   ~SparseParameterDistribution() {}
@@ -39,7 +39,7 @@ public:
   void probeDistribution(int serverId, size_t data);
   void checkAndResetDistribution();
 
-private:
+ private:
   std::vector<size_t> data_;
   std::atomic<size_t> totBytes_;
 
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 6019dccaad..206cd17c37 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -30,12 +30,12 @@ struct MessageHeader {
 };
 
 class Thread {
-public:
+ public:
   void start();
   virtual void run() = 0;
   virtual ~Thread() {}
 
-protected:
+ protected:
   std::unique_ptr<std::thread> thread_;
 };
 
@@ -44,13 +44,13 @@ void Thread::start() {
 }
 
 class SocketChannel {
-public:
+ public:
   explicit SocketChannel(int socket) : socket_(socket) {}
   int getSocketFd() const { return socket_; }
   uint64_t readAll(void* buf, size_t size);
   uint64_t writeAll(const void* buf, size_t size);
 
-protected:
+ protected:
   int socket_;
 };
 
@@ -79,7 +79,7 @@ uint64_t SocketChannel::writeAll(const void* buf, size_t size) {
 }
 
 class SocketWorker : public Thread {
-public:
+ public:
   explicit SocketWorker(int socket) : channel_(socket) {}
   virtual void run();
 
@@ -88,19 +88,19 @@ public:
 
   // write n bytes
 
-protected:
+ protected:
   SocketChannel channel_;
   std::string buffer_;
 };
 
 class SocketServer : public Thread {
-public:
+ public:
   explicit SocketServer(int port)
       : port_(port), socket_(0), maxPendingConnections_(100) {}
 
   virtual void run();
 
-protected:
+ protected:
   int port_;
   int socket_;
   int maxPendingConnections_;
@@ -161,11 +161,11 @@ void SocketWorker::run() {
 }
 
 class SocketClient {
-public:
+ public:
   SocketClient(const std::string& serverAddr, int serverPort);
   SocketChannel* getChannel() const { return channel_.get(); }
 
-protected:
+ protected:
   std::unique_ptr<SocketChannel> channel_;
 };
 
diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp
index e742cd0871..01d179258d 100644
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
@@ -26,7 +26,7 @@ DEFINE_string(server_addr, "127.0.0.1", "assign server address");
 DEFINE_int32(server_cpu, 0, "assign server cpu");
 
 class ParameterServer2Tester : public ParameterServer2 {
-public:
+ public:
   ParameterServer2Tester(std::string serverAddr,
                          int port,
                          int rdmaCpu = -1,
@@ -88,7 +88,7 @@ public:
   void waitPassFinishTest();
   void synchronizeTest();
 
-protected:
+ protected:
   ParameterClient2 client_;
   vector<ParameterConfig> clientConfigs_;
   vector<ParameterPtr> parameters_;
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index d68a8d2180..a66b14a1cc 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -28,7 +28,7 @@ DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
 using namespace paddle;  // NOLINT
 
 class MyServer : public ProtoServer {
-public:
+ public:
   explicit MyServer(int port, int rdmaCpu = -1)
       : ProtoServer(FLAGS_server_addr, port, rdmaCpu),
         status_(PSERVER_STATUS_NOT_SET) {
@@ -62,7 +62,7 @@ public:
     callback(response);
   }
 
-protected:
+ protected:
   PServerStatus status_;
   std::string buffer_;
 };
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
deleted file mode 100755
index 92b8b90880..0000000000
--- a/paddle/scripts/docker/build.sh
+++ /dev/null
@@ -1,259 +0,0 @@
-#!/bin/bash
-
-function cmake_gen() {
-    mkdir -p /paddle/build
-    cd /paddle/build
-
-    # build script will not fail if *.deb does not exist
-    rm *.deb 2>/dev/null || true
-    # delete previous built whl packages
-    rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-    # Support build for all python versions, currently
-    # including cp27-cp27m and cp27-cp27mu.
-    PYTHON_FLAGS=""
-    if [ "$1" != "" ]; then
-        echo "using python abi: $1"
-        if [ "$1" == "cp27-cp27m" ]; then
-            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
-            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
-            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
-        elif [ "$1" == "cp27-cp27mu" ]; then
-            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
-            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
-            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
-        fi
-    fi
-
-    cat <<EOF
-    ========================================
-    Configuring cmake in /paddle/build ...
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
-        ${PYTHON_FLAGS}
-        -DWITH_DSO=ON
-        -DWITH_DOC=${WITH_DOC:-OFF}
-        -DWITH_GPU=${WITH_GPU:-OFF}
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-        -DWITH_MKL=${WITH_MKL:-ON}
-        -DWITH_AVX=${WITH_AVX:-OFF}
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF}
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DWITH_SWIG_PY=ON
-        -DWITH_C_API=${WITH_C_API:-OFF}
-        -DWITH_PYTHON=${WITH_PYTHON:-ON}
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-        -DCUDNN_ROOT=/usr/
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-        -DWITH_TESTING=${WITH_TESTING:-ON}
-        -DWITH_FAST_BUNDLE_TEST=ON
-        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-    ========================================
-EOF
-    # Disable UNITTEST_USE_VIRTUALENV in docker because
-    # docker environment is fully controlled by this script.
-    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-    cmake .. \
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
-        ${PYTHON_FLAGS} \
-        -DWITH_DSO=ON \
-        -DWITH_DOC=${WITH_DOC:-OFF} \
-        -DWITH_GPU=${WITH_GPU:-OFF} \
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-        -DWITH_MKL=${WITH_MKL:-ON} \
-        -DWITH_AVX=${WITH_AVX:-OFF} \
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-        -DWITH_C_API=${WITH_C_API:-OFF} \
-        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-        -DCUDNN_ROOT=/usr/ \
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-        -DWITH_TESTING=${WITH_TESTING:-ON} \
-        -DWITH_FAST_BUNDLE_TEST=ON \
-        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-}
-
-function run_build() {
-    cat <<EOF
-    ============================================
-    Building in /paddle/build ...
-    ============================================
-EOF
-    make clean
-    make -j `nproc`
-}
-
-function run_test() {
-    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-    cat <<EOF
-    ========================================
-    Running unit tests ...
-    ========================================
-EOF
-        ctest --output-on-failure
-        # make install should also be test when unittest
-        make install -j `nproc`
-        pip install /usr/local/opt/paddle/share/wheels/*.whl
-        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
-            paddle version
-        fi
-    fi
-}
-
-
-function gen_docs() {
-    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
-        cat <<EOF
-    ========================================
-    Building documentation ...
-    In /paddle/build_doc
-    ========================================
-EOF
-        mkdir -p /paddle/build_doc
-        pushd /paddle/build_doc
-        cmake .. \
-            -DWITH_DOC=ON \
-            -DWITH_GPU=OFF \
-            -DWITH_AVX=${WITH_AVX:-ON} \
-            -DWITH_SWIG_PY=ON \
-            -DWITH_STYLE_CHECK=OFF
-
-        make -j `nproc` paddle_docs paddle_apis
-        popd
-    fi
-
-
-    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
-        cat <<EOF
-    ========================================
-    Converting C++ source code into HTML ...
-    ========================================
-EOF
-        export WOBOQ_OUT=/paddle/build/woboq_out
-        mkdir -p $WOBOQ_OUT
-        cp -rv /woboq/data $WOBOQ_OUT/../data
-        /woboq/generator/codebrowser_generator \
-            -b /paddle/build \
-            -a \
-            -o $WOBOQ_OUT \
-            -p paddle:/paddle
-        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-    fi
-}
-
-
-function gen_dockerfile() {
-    # Set BASE_IMAGE according to env variables
-    if [[ ${WITH_GPU} == "ON" ]]; then
-    BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
-    else
-    BASE_IMAGE="ubuntu:16.04"
-    fi
-
-    DOCKERFILE_GPU_ENV=""
-    DOCKERFILE_CUDNN_DSO=""
-    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-    fi
-
-    cat <<EOF
-    ========================================
-    Generate /paddle/build/Dockerfile ...
-    ========================================
-EOF
-
-    cat > /paddle/build/Dockerfile <<EOF
-    FROM ${BASE_IMAGE}
-    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-    ENV HOME /root
-EOF
-
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
-    else
-        NCCL_DEPS=""
-    fi
-
-    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
-        PADDLE_VERSION="paddle version"
-        CMD='"paddle", "version"'
-    else
-        PADDLE_VERSION="true"
-        CMD='"true"'
-    fi
-
-    cat >> /paddle/build/Dockerfile <<EOF
-    ADD python/dist/*.whl /
-    # run paddle version to install python packages first
-    RUN apt-get update &&\
-        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
-        pip install /*.whl; apt-get install -f -y && \
-        apt-get clean -y && \
-        rm -f /*.whl && \
-        ${PADDLE_VERSION} && \
-        ldconfig
-    ${DOCKERFILE_CUDNN_DSO}
-    ${DOCKERFILE_GPU_ENV}
-    ENV NCCL_LAUNCH_MODE PARALLEL
-EOF
-    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
-        cat >> /paddle/build/Dockerfile <<EOF
-        ADD go/cmd/pserver/pserver /usr/bin/
-        ADD go/cmd/master/master /usr/bin/
-EOF
-    fi
-    cat >> /paddle/build/Dockerfile <<EOF
-    # default command shows the paddle version and exit
-    CMD [${CMD}]
-EOF
-}
-
-function gen_capi_package() {
-  if [[ ${WITH_C_API} == "ON" ]]; then
-    install_prefix="/paddle/build/capi_output"
-    rm -rf $install_prefix
-    make DESTDIR="$install_prefix" install
-    cd $install_prefix/usr/local
-    ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
-  fi
-}
-
-function gen_fluid_inference_lib() {
-    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
-    cat <<EOF
-    ========================================
-    Deploying fluid inference library ...
-    ========================================
-EOF
-        make -j `nproc` inference_lib_dist
-    fi
-}
-
-set -xe
-
-cmake_gen ${PYTHON_ABI:-""}
-run_build
-run_test
-gen_docs
-gen_dockerfile
-gen_capi_package
-gen_fluid_inference_lib
-
-if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
-  printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
-else
-  printf "If you need to install PaddlePaddle in develop docker image,"
-  printf "please make install or pip install build/python/dist/*.whl.\n"
-fi
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
deleted file mode 100644
index 3d5e775faf..0000000000
--- a/paddle/scripts/docker/build_android.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -xe
-
-if [ $ANDROID_ABI == "arm64-v8a" ]; then
-  ANDROID_ARCH=arm64
-  if [ $ANDROID_API -lt 21 ]; then
-    echo "Warning: arm64-v8a requires ANDROID_API >= 21."
-    ANDROID_API=21
-  fi
-else # armeabi, armeabi-v7a
-  ANDROID_ARCH=arm
-fi
-
-ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
-
-cat <<EOF
-============================================
-Generating the standalone toolchain ...
-${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
-      --arch=$ANDROID_ARCH
-      --platform=android-$ANDROID_API
-      --install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
-============================================
-EOF
-${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
-      --arch=$ANDROID_ARCH \
-      --platform=android-$ANDROID_API \
-      --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-
-BUILD_ROOT=/paddle/build_android
-DEST_ROOT=/paddle/install_android
-
-mkdir -p $BUILD_ROOT
-cd $BUILD_ROOT
-
-if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_NEON=ON \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DUSE_EIGEN_FOR_BLAS=ON \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DUSE_EIGEN_FOR_BLAS=OFF \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-elif [ $ANDROID_ABI == "armeabi" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-else
-  echo "Invalid ANDROID_ABI: $ANDROID_ABI"
-fi
-
-cat <<EOF
-============================================
-Building in $BUILD_ROOT ...
-============================================
-EOF
-make -j `nproc`
-make install -j `nproc`
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
deleted file mode 100755
index bc194bd909..0000000000
--- a/paddle/scripts/docker/entrypoint
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-/usr/sbin/sshd -D &
-jupyter notebook --ip=0.0.0.0 /paddle/book/
diff --git a/paddle/scripts/docker/test.sh b/paddle/scripts/docker/test.sh
deleted file mode 100755
index 8180737a8f..0000000000
--- a/paddle/scripts/docker/test.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-set -e
-
-# the number of process to run tests
-NUM_PROC=6
-
-# calculate and set the memory usage for each process
-MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
-export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
-
-# get the CUDA device count
-CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
-
-for (( i = 0; i < $NUM_PROC; i++ )); do
-    cuda_list=()
-    for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
-        s=$[i+j]
-        n=$[s%CUDA_DEVICE_COUNT]
-        if [ $j -eq 0 ]; then
-            cuda_list=("$n")
-        else
-            cuda_list="$cuda_list,$n"
-        fi
-    done
-    echo $cuda_list
-    # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
-    # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
-    env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
-done
-wait
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index fbe219a1c9..c6eef8683d 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -99,12 +99,13 @@ function cmake_gen() {
         -DWITH_PYTHON=${WITH_PYTHON:-ON}
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
         -DCUDNN_ROOT=/usr/
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
         -DWITH_TESTING=${WITH_TESTING:-ON}
         -DWITH_FAST_BUNDLE_TEST=ON
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because
@@ -126,12 +127,12 @@ EOF
         -DWITH_C_API=${WITH_C_API:-OFF} \
         -DWITH_PYTHON=${WITH_PYTHON:-ON} \
         -DCUDNN_ROOT=/usr/ \
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
         -DWITH_FAST_BUNDLE_TEST=ON \
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
 }
 
 function abort(){
@@ -144,19 +145,17 @@ function check_style() {
     trap 'abort' 0
     set -e
 
-    # install glide
-    curl https://glide.sh/get | bash
-    eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+    if [ -x "$(command -v gimme)" ]; then
+    	eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+    fi
 
     # set up go environment for running gometalinter
     mkdir -p $GOPATH/src/github.com/PaddlePaddle/
     ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
-    cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
-
-    go get github.com/alecthomas/gometalinter
-    gometalinter --install
+    mkdir -p ./build/go
+    cp go/glide.* build/go
+    cd build/go; glide install; cd -
 
-    cd ${PADDLE_ROOT}
     export PATH=/usr/bin:$PATH
     pre-commit install
     clang-format --version
@@ -183,6 +182,7 @@ function build() {
 EOF
     make clean
     make -j `nproc`
+    make install -j `nproc`
 }
 
 function build_android() {
@@ -231,7 +231,6 @@ EOF
             -DUSE_EIGEN_FOR_BLAS=ON \
             -DWITH_C_API=ON \
             -DWITH_SWIG_PY=OFF \
-            -DWITH_STYLE_CHECK=OFF \
             ..
     elif [ $ANDROID_ABI == "arm64-v8a" ]; then
       cmake -DCMAKE_SYSTEM_NAME=Android \
@@ -245,7 +244,6 @@ EOF
             -DUSE_EIGEN_FOR_BLAS=OFF \
             -DWITH_C_API=ON \
             -DWITH_SWIG_PY=OFF \
-            -DWITH_STYLE_CHECK=OFF \
             ..
     elif [ $ANDROID_ABI == "armeabi" ]; then
       cmake -DCMAKE_SYSTEM_NAME=Android \
@@ -258,7 +256,6 @@ EOF
             -DCMAKE_BUILD_TYPE=MinSizeRel \
             -DWITH_C_API=ON \
             -DWITH_SWIG_PY=OFF \
-            -DWITH_STYLE_CHECK=OFF \
             ..
     else
       echo "Invalid ANDROID_ABI: $ANDROID_ABI"
@@ -287,7 +284,6 @@ function build_ios() {
           -DUSE_EIGEN_FOR_BLAS=ON \
           -DWITH_TESTING=OFF \
           -DWITH_SWIG_PY=OFF \
-          -DWITH_STYLE_CHECK=OFF \
           -DCMAKE_BUILD_TYPE=Release
     
     make -j 2
@@ -375,8 +371,7 @@ EOF
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_DOC=ON \
         -DWITH_GPU=OFF \
-        -DWITH_MKL=OFF \
-        -DWITH_STYLE_CHECK=OFF
+        -DWITH_MKL=OFF
 
     make -j `nproc` paddle_docs paddle_apis
 
@@ -415,9 +410,11 @@ function gen_dockerfile() {
 
     DOCKERFILE_GPU_ENV=""
     DOCKERFILE_CUDNN_DSO=""
+    DOCKERFILE_CUBLAS_DSO=""
     if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
         DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
+        DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
+        DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
     fi
 
     cat <<EOF
@@ -433,7 +430,7 @@ EOF
 EOF
 
     if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
     else
         NCCL_DEPS=""
     fi
@@ -451,13 +448,14 @@ EOF
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
+        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
         ${PADDLE_VERSION} && \
         ldconfig
     ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
     ${DOCKERFILE_GPU_ENV}
     ENV NCCL_LAUNCH_MODE PARALLEL
 EOF
@@ -493,7 +491,10 @@ function gen_fluid_inference_lib() {
     ========================================
 EOF
         make -j `nproc` inference_lib_dist
-    fi
+        cd ${PADDLE_ROOT}/build
+        mv fluid_install_dir fluid
+        tar -cf fluid.tgz fluid
+      fi
 }
 
 function main() {
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 77588b8872..3462deb9c2 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -14,20 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-function container_running() {
-    name=$1
-    docker ps -a --format "{{.Names}}" | grep "${name}" > /dev/null
-    return $?
-}
-
 function start_build_docker() {
     docker pull $IMG
 
-    if container_running "${CONTAINER_ID}"; then
-        docker stop "${CONTAINER_ID}" 1>/dev/null
-        docker rm -f "${CONTAINER_ID}" 1>/dev/null
-    fi
-
     apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g'
     DOCKER_ENV=$(cat <<EOL
         -e FLAGS_fraction_of_gpu_memory_to_use=0.15 \
@@ -61,7 +50,6 @@ EOL
     fi
     set -ex
     ${DOCKER_CMD} run -it \
-        --name $CONTAINER_ID \
         ${DOCKER_ENV} \
         -e SCRIPT_NAME=$0 \
         -v $PADDLE_ROOT:/paddle \
@@ -75,10 +63,8 @@ EOL
 function main() {
     DOCKER_REPO="paddlepaddle/paddle"
     VERSION="latest-dev"
-    CONTAINER_ID="${USER}_paddle_dev"
     PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
     if [ "$1" == "build_android" ]; then
-        CONTAINER_ID="${USER}_paddle_dev_android"
         VERSION="latest-dev-android"
     fi
     IMG=${DOCKER_REPO}:${VERSION}
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
deleted file mode 100755
index d7527d9948..0000000000
--- a/paddle/scripts/travis/build_doc.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-set -e
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build
-cd $TRAVIS_BUILD_DIR/build
-
-# Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
-
-make -j `nproc` paddle_docs paddle_apis
-
-# check websites for broken links
-linkchecker doc/v2/en/html/index.html
-linkchecker doc/v2/cn/html/index.html
-linkchecker doc/v2/api/en/html/index.html
diff --git a/paddle/scripts/travis/build_ios.sh b/paddle/scripts/travis/build_ios.sh
deleted file mode 100755
index dee7cf7cbb..0000000000
--- a/paddle/scripts/travis/build_ios.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build_ios
-cd $TRAVIS_BUILD_DIR/build_ios
-
-# Compile paddle binaries
-cmake -DCMAKE_SYSTEM_NAME=iOS \
-      -DIOS_PLATFORM=OS \
-      -DCMAKE_OSX_ARCHITECTURES="arm64" \
-      -DWITH_C_API=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DWITH_TESTING=OFF \
-      -DWITH_SWIG_PY=OFF \
-      -DWITH_STYLE_CHECK=OFF \
-      -DCMAKE_BUILD_TYPE=Release \
-      ..
-
-make -j 2
diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh
deleted file mode 100755
index e71d243efa..0000000000
--- a/paddle/scripts/travis/check_style.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-function abort(){
-    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
-    echo "Please use pre-commit to check what is wrong." 1>&2
-    exit 1
-}
-
-trap 'abort' 0
-set -e
-
-# install glide
-curl https://glide.sh/get | bash
-eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
-
-# set up go environment for running gometalinter
-mkdir -p $GOPATH/src/github.com/PaddlePaddle/
-ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
-cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
-
-go get github.com/alecthomas/gometalinter
-gometalinter --install
-
-cd $TRAVIS_BUILD_DIR
-export PATH=/usr/bin:$PATH
-pre-commit install
-clang-format --version
-
-
-
-if ! pre-commit run -a ; then
-    git diff
-    exit 1
-fi
-
-trap : 0
diff --git a/paddle/scripts/travis/deploy_key.enc b/paddle/scripts/travis/deploy_key.enc
deleted file mode 100644
index b0aa45c5ac..0000000000
Binary files a/paddle/scripts/travis/deploy_key.enc and /dev/null differ
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 586ec48477..507479c862 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -30,7 +30,7 @@ int main(int argc, char** argv) {
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index 72911695bd..6192de4388 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -36,17 +36,12 @@ endif()
 add_library(paddle_trainer_lib STATIC
     ${TRAINER_SOURCES})
 
-add_style_check_target(paddle_trainer_lib
-    ${TRAINER_SOURCES})
-add_style_check_target(paddle_trainer_lib
-    ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
     paddle_proto
     ${external_project_dependencies})
 
 macro(add_paddle_exe TARGET_NAME)
   add_executable(${TARGET_NAME} ${ARGN})
-  add_style_check_target(${TARGET_NAME} ${ARGN})
   link_paddle_exe(${TARGET_NAME})
 endmacro()
 
diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h
index 6223ba427c..02693c675e 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -29,7 +29,7 @@ namespace paddle {
  * New remote parameter updater for dense parameters that use cclient of go.
  */
 class NewRemoteParameterUpdater : public ParameterUpdater {
-public:
+ public:
   NewRemoteParameterUpdater(const OptimizationConfig& config,
                             const std::string pserverSpec);
   NewRemoteParameterUpdater(const OptimizationConfig& config,
@@ -61,13 +61,13 @@ public:
   virtual void startPass();
   virtual bool finishPass();
 
-protected:
+ protected:
   /**
    * work need to do after finishBatch
    */
   virtual void updateImpl(Parameter* para);
 
-private:
+ private:
   int parameterSize() { return (int)parameters_.size(); }
 
   /**
@@ -104,7 +104,7 @@ private:
     }
   }
 
-protected:
+ protected:
   const OptimizationConfig& trainerConfig_;
   /// internal parameter client object for exchanging data with pserver
   paddle_pserver_client parameterClient_;
diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h
index 2e05595848..10746b4d58 100644
--- a/paddle/trainer/ParamUtil.h
+++ b/paddle/trainer/ParamUtil.h
@@ -56,7 +56,7 @@ struct ParameterUtilConfig {
  * Utility class for loading and saving parameters
  */
 class ParameterUtil {
-public:
+ public:
   /**
    * Ctor.
    *
@@ -115,7 +115,7 @@ public:
     }
   }
 
-private:
+ private:
   std::shared_ptr<TrainerConfigHelper> config_;
   std::unique_ptr<ParameterUtilConfig> intConfig_;
   GradientMachinePtr gserver_;
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index 9e9e948b88..ef7ab92eca 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -36,7 +36,7 @@ namespace paddle {
  * @brief Parameter Updater for SGD, and local(not cluster) run.
  */
 class SgdLocalUpdater : public ParameterUpdater {
-public:
+ public:
   /**
    * @brief Ctor. Initialize optimizer locally by optConfig.
    * @param optConfig optimization config.
@@ -131,7 +131,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief update method. Update value from gradient.
    * @param para parameter that will be updated.
@@ -159,7 +159,7 @@ protected:
  * @deprecated
  */
 class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
-public:
+ public:
   explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
       : SgdLocalUpdater(optConfig),
         Deprecated(
@@ -178,7 +178,7 @@ public:
     optimizer_->finishBatch();
   }
 
-protected:
+ protected:
   /**
    * @brief do nothing.
    * @param para
@@ -192,7 +192,7 @@ protected:
  * It will do model average in cpu to reduce gpu memory comsuption.
  */
 class SgdUpdaterWithCpuAverager : public SgdLocalUpdater {
-public:
+ public:
   /**
    * @brief Ctor.
    *
@@ -233,12 +233,12 @@ public:
    */
   virtual void restore();
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para);
 
   void updateFunc(Parameter* para);
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> averager_;
 
   /**
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index 5e82c94475..3a40a46354 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -53,7 +53,7 @@ namespace paddle {
  * backward and communication is not supported.
  */
 class RemoteParameterUpdater : public ParameterUpdater {
-public:
+ public:
   RemoteParameterUpdater(
       const OptimizationConfig& config,
       int expectedPassCount,
@@ -101,7 +101,7 @@ public:
   virtual void apply();
   virtual void restore();
 
-protected:
+ protected:
   /**
    * control all pservers with all trainers for sync-sgd
    */
@@ -128,7 +128,7 @@ protected:
    */
   void copyParametersFromDevice(ParameterType parameterType);
 
-protected:
+ protected:
   /// Optimization config used to guide initialization and finishBatch
   OptimizationConfig config_;
   /// internal parameter client object for exchanging data with pserver
@@ -178,7 +178,7 @@ protected:
  * It contains separate send and recv thread for pipeline usage.
  */
 class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater {
-public:
+ public:
   ConcurrentRemoteParameterUpdater(
       OptimizationConfig config,
       int expectedPassCount,
@@ -194,7 +194,7 @@ public:
    */
   virtual void finishBatch(real cost);
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para);
   /// internal thread called in send thread
   void send(Parameter* para);  // para == NULL indicate end of a minibatch
@@ -221,7 +221,7 @@ protected:
     return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0;
   }
 
-private:
+ private:
   /// send thread used for overlapping
   std::unique_ptr<std::thread> sendThread_;
   /// recv thread used for overlapping
@@ -263,7 +263,7 @@ private:
  * to encapsulate sparse specified message for all pservers.
  */
 class SparseRemoteParameterUpdater : public ParameterUpdater {
-public:
+ public:
   SparseRemoteParameterUpdater(const OptimizationConfig& config,
                                int expectedPassCount,
                                bool testing);
@@ -303,7 +303,7 @@ public:
   }
 #endif
 
-protected:
+ protected:
   /// update implimentation, not implemented
   virtual void updateImpl(Parameter* para) {}
 
@@ -313,7 +313,7 @@ protected:
   /// start controller thread
   void startController();
 
-protected:
+ protected:
   /// optimization config
   OptimizationConfig config_;
   /// internal parameter client
@@ -335,7 +335,7 @@ protected:
  * it directly call internal dense and sparse udpater individually.
  */
 class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite {
-public:
+ public:
   enum {
     UPDATER_SPARSE_REMOTE = 0,  // execute in sync thread pool(tid:0)
     UPDATER_NORMAL = 1,         // execute in Owner thread(tid:1)
@@ -364,7 +364,7 @@ public:
 };
 
 class ParameterUpdaterCreators {
-public:
+ public:
   /**
    * @brief add a creator to create custom ParameterUpdater while training.
    *        The creator is a function with type (alogrithm, optConfig, isLocal,
@@ -407,7 +407,7 @@ public:
     return nullptr;
   }
 
-private:
+ private:
   static std::vector<std::function<ParameterUpdater*(
       const std::string&, const OptimizationConfig&, bool, size_t)>>
       constructors_;
diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h
index e892744db2..801c77e311 100644
--- a/paddle/trainer/Tester.h
+++ b/paddle/trainer/Tester.h
@@ -38,7 +38,7 @@ namespace paddle {
  * It is a private class for Trainer.
  */
 class Tester {
-public:
+ public:
   /**
    * Ctor
    * @param config Trainer Config.
@@ -87,7 +87,7 @@ public:
    */
   void test();
 
-protected:
+ protected:
   std::shared_ptr<ParameterClient2> testParameterClient_;
   std::shared_ptr<TrainerConfigHelper> config_;
   std::unique_ptr<TesterConfig> intconfig_;
@@ -107,7 +107,7 @@ protected:
     real cost;
   } testContext_;
 
-private:
+ private:
   /**
    * Test one batch by batchId. It is only used for testOnePass.
    *
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index bc08a9e9f0..b5e6a7ce3c 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -39,7 +39,7 @@ namespace paddle {
    class.
  */
 class SgdThreadUpdater : public ParameterUpdater {
-public:
+ public:
   explicit SgdThreadUpdater(const OptimizationConfig& optConfig);
   virtual ~SgdThreadUpdater() {}
 
@@ -57,7 +57,7 @@ public:
   virtual void apply();
   virtual void restore();
 
-protected:
+ protected:
   // This is the function that will be eventualy called by the GradientMachine.
   // used only for GPU update.
   virtual void updateImpl(Parameter* para);
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index fac589d1d7..78127b7be5 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -41,7 +41,7 @@ namespace paddle {
  * train/test a NeuralNetwork.
  */
 class Trainer {
-public:
+ public:
   /**
    * Ctor.
    * @return
@@ -138,7 +138,7 @@ public:
    */
   ParameterUtil* getParameterUtilPtr();
 
-protected:
+ protected:
   /**
    * Train one pass of data.
    *
@@ -159,10 +159,10 @@ protected:
 
   void createTester();
 
-private:
+ private:
   std::unique_ptr<TesterConfig> createTesterConfig();
 
-protected:
+ protected:
   std::shared_ptr<TrainerConfigHelper> config_;
   std::shared_ptr<TrainerStats> stats_;
 
diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h
index f1366cc041..b21dda964e 100644
--- a/paddle/trainer/TrainerConfigHelper.h
+++ b/paddle/trainer/TrainerConfigHelper.h
@@ -37,7 +37,7 @@ class DataConfig;
  * Define a macro to unify 'final' keyword
  */
 class TrainerConfigHelper /*final*/ {
-public:
+ public:
   DISABLE_COPY(TrainerConfigHelper);
 
   /**
@@ -193,7 +193,7 @@ public:
    */
   static std::shared_ptr<TrainerConfigHelper> createFromFlagConfig();
 
-private:
+ private:
   static std::string getConfigNameFromPassId(int passId,
                                              const std::string& modelPath);
 
diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h
index 7018faab24..48ee53a5e6 100644
--- a/paddle/trainer/TrainerInternal.h
+++ b/paddle/trainer/TrainerInternal.h
@@ -34,7 +34,7 @@ namespace paddle {
  * the core training class for driving training logic
  */
 class TrainerInternal {
-public:
+ public:
   struct ParaStat {
     real maxAbsGrad;
     real avgAbsGrad;
@@ -126,7 +126,7 @@ public:
                                     UpdateCallback updateCallback,
                                     bool doPipelineUpdate);
 
-protected:
+ protected:
   std::shared_ptr<ParameterUpdater> parameterUpdater_;
   GradientMachinePtr gradientMachine_;
   std::shared_ptr<TrainerConfigHelper> config_;
diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h
index b47692720e..43aae38102 100644
--- a/paddle/trainer/TrainerInternalConfig.h
+++ b/paddle/trainer/TrainerInternalConfig.h
@@ -37,7 +37,7 @@ namespace paddle {
  * through one mini-batch.
  */
 class TrainerStats {
-public:
+ public:
   /**
    * @brief reset all stats.
    *
@@ -147,7 +147,7 @@ public:
     return os.str();
   }
 
-private:
+ private:
   int64_t numProcessed_;
   real totalCost_;
   real currentCost_;
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
index eaa8b9baf6..75349537b1 100644
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
@@ -125,7 +125,7 @@ enum { INDENT_WIDTH = 2 };
 struct null {};
 
 class value {
-public:
+ public:
   typedef std::vector<value> array;
   typedef std::map<std::string, value> object;
   union _storage {
@@ -139,11 +139,11 @@ public:
     object* object_;
   };
 
-protected:
+ protected:
   int type_;
   _storage u_;
 
-public:
+ public:
   value();
   value(int type, bool);
   explicit value(bool b);
@@ -179,7 +179,7 @@ public:
   void serialize(Iter os, bool prettify = false) const;
   std::string serialize(bool prettify = false) const;
 
-private:
+ private:
   template <typename T>
   value(const T*);  // intentionally defined to block implicit conversion of
                     // pointer to bool
@@ -588,13 +588,13 @@ inline std::string value::_serialize(int indent) const {
 
 template <typename Iter>
 class input {
-protected:
+ protected:
   Iter cur_, end_;
   int last_ch_;
   bool ungot_;
   int line_;
 
-public:
+ public:
   input(const Iter& first, const Iter& last)
       : cur_(first), end_(last), last_ch_(-1), ungot_(false), line_(1) {}
   int getc() {
@@ -873,7 +873,7 @@ inline bool _parse(Context& ctx, input<Iter>& in) {
 }
 
 class deny_parse_context {
-public:
+ public:
   bool set_null() { return false; }
   bool set_bool(bool) { return false; }
 #ifdef PICOJSON_USE_INT64
@@ -898,10 +898,10 @@ public:
 };
 
 class default_parse_context {
-protected:
+ protected:
   value* out_;
 
-public:
+ public:
   default_parse_context(value* out) : out_(out) {}
   bool set_null() {
     *out_ = value();
@@ -949,18 +949,18 @@ public:
     return _parse(ctx, in);
   }
 
-private:
+ private:
   default_parse_context(const default_parse_context&);
   default_parse_context& operator=(const default_parse_context&);
 };
 
 class null_parse_context {
-public:
+ public:
   struct dummy_str {
     void push_back(int) {}
   };
 
-public:
+ public:
   null_parse_context() {}
   bool set_null() { return true; }
   bool set_bool(bool) { return true; }
@@ -985,7 +985,7 @@ public:
     return _parse(*this, in);
   }
 
-private:
+ private:
   null_parse_context(const null_parse_context&);
   null_parse_context& operator=(const null_parse_context&);
 };
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index b2a93d4d5e..de12c4d649 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -38,7 +38,7 @@ DECLARE_int32(num_passes);
 DECLARE_int32(saving_period);
 
 class TrainerForTest : public paddle::Trainer {
-public:
+ public:
   inline const std::shared_ptr<ParameterUpdater>& getParameterUpdaterForTest() {
     return this->trainerInternal_.getParameterUpdater();
   }
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 6292e7fa52..b42b2bae96 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -14,9 +14,6 @@ add_library(paddle_utils STATIC
         ${UTIL_SOURCES}
         ${UTIL_ARCH_SOURCES}
         ${UTIL_RES})
-add_style_check_target(paddle_utils ${UTIL_HEADERS})
-add_style_check_target(paddle_utils ${UTIL_SOURCES}
-    ${UTIL_ARCH_SOURCES})
 add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/utils/ClassRegistrar.h b/paddle/utils/ClassRegistrar.h
index 1ac27bafab..5f40a0b25e 100644
--- a/paddle/utils/ClassRegistrar.h
+++ b/paddle/utils/ClassRegistrar.h
@@ -41,7 +41,7 @@ namespace paddle {
  */
 template <class BaseClass, typename... CreateArgs>
 class ClassRegistrar {
-public:
+ public:
   typedef std::function<BaseClass*(CreateArgs...)> ClassCreator;
 
   // Register a class using a creation function.
@@ -74,7 +74,7 @@ public:
     }
   }
 
-protected:
+ protected:
   std::map<std::string, ClassCreator> creatorMap_;
 };
 
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index 869be5be54..ed58211d13 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -35,7 +35,7 @@ enum simd_t {
 // clang-format on
 
 class SIMDFlags final {
-public:
+ public:
   DISABLE_COPY(SIMDFlags);
 
   SIMDFlags();
@@ -46,7 +46,7 @@ public:
     return !((simd_flags_ & flags) ^ flags);
   }
 
-private:
+ private:
   int simd_flags_ = SIMD_NONE;
 };
 
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index 52a6df9497..b60077ea2d 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -49,7 +49,7 @@ namespace paddle {
  */
 template <typename T>
 class CustomStackTrace {
-public:
+ public:
   /**
    * @brief Pop out an item from the top of the stack if item == top.
    *        Else, just set status to popping.
@@ -136,7 +136,7 @@ public:
     p.push(item);
   }
 
-private:
+ private:
   /**
    * Get thread local attribute, and save them into a map (threadId => TYPE*)
    *
@@ -174,7 +174,7 @@ private:
     return this->getThreadLocal(this->isPushing_, this->pushingBuffers_);
   }
 
-private:
+ private:
   mutable std::mutex mtx_;
 
   std::unordered_map<std::thread::id, std::stack<T>*> stackBuffers_;
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index 7cde983060..1fc8482e3a 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -95,7 +95,7 @@ namespace paddle {
  * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
  */
 class Error {
-public:
+ public:
   /**
    * Construct a no-error value.
    */
@@ -138,7 +138,7 @@ public:
    */
   bool isOK() const { return msg_ == nullptr; }
 
-private:
+ private:
   std::shared_ptr<std::string> msg_;
 };
 
diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h
index 0ec1c28dfb..3f45e82268 100644
--- a/paddle/utils/GlobalConstants.h
+++ b/paddle/utils/GlobalConstants.h
@@ -78,7 +78,7 @@ enum ParameterType {
 using namespace enumeration_wrapper;  // NOLINT
 
 class TrainAlgorithm {
-public:
+ public:
   static const std::string SGD;
   static const std::string AsyncSGD;
   static const std::string OWLQN;
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index e87abb9139..65f983685f 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -42,7 +42,7 @@ namespace paddle {
  * Use unlock() to unlock the lock.
  */
 class RWLock {
-public:
+ public:
   RWLock() { pthread_rwlock_init(&rwlock_, NULL); }
   ~RWLock() { pthread_rwlock_destroy(&rwlock_); }
   RWLock(const RWLock&) = delete;
@@ -62,7 +62,7 @@ public:
   void lock_shared() { pthread_rwlock_rdlock(&rwlock_); }
   void unlock() { pthread_rwlock_unlock(&rwlock_); }
 
-protected:
+ protected:
   pthread_rwlock_t rwlock_;
 };
 
@@ -71,7 +71,7 @@ protected:
  * using RAII management mechanism.
  */
 class ReadLockGuard {
-public:
+ public:
   /**
    * @brief Construct Function. Lock on rwlock in read mode.
    */
@@ -86,7 +86,7 @@ public:
    */
   ~ReadLockGuard() { rwlock_->unlock(); }
 
-protected:
+ protected:
   RWLock* rwlock_;
 };
 
@@ -98,7 +98,7 @@ protected:
  */
 class SpinLockPrivate;
 class SpinLock {
-public:
+ public:
   DISABLE_COPY(SpinLock);
   SpinLock();
   ~SpinLock();
@@ -107,7 +107,7 @@ public:
   void lock();
   void unlock();
 
-private:
+ private:
   SpinLockPrivate* m;
 };
 
@@ -116,7 +116,7 @@ private:
  */
 class SemaphorePrivate;
 class Semaphore {
-public:
+ public:
   //! Disable copy & assign
   Semaphore(const Semaphore& other) = delete;
   Semaphore& operator=(const Semaphore&& other) = delete;
@@ -124,7 +124,7 @@ public:
   //! Enable move.
   Semaphore(Semaphore&& other) : m(std::move(other.m)) {}
 
-public:
+ public:
   /**
    * @brief Construct Function.
    * @param[in] initValue the initial value of the
@@ -156,7 +156,7 @@ public:
    */
   void post();
 
-private:
+ private:
   SemaphorePrivate* m;
 };
 
@@ -166,7 +166,7 @@ private:
  */
 class ThreadBarrierPrivate;
 class ThreadBarrier {
-public:
+ public:
   DISABLE_COPY(ThreadBarrier);
 
   /**
@@ -184,7 +184,7 @@ public:
    */
   void wait();
 
-private:
+ private:
   ThreadBarrierPrivate* m;
 };
 
@@ -192,7 +192,7 @@ private:
  * A wrapper for condition variable with mutex.
  */
 class LockedCondition : public std::condition_variable {
-public:
+ public:
   /**
    * @brief execute op and notify one thread which was blocked.
    * @param[in] op a thread can do something in op before notify.
@@ -235,7 +235,7 @@ public:
    */
   std::mutex* mutex() { return &mutex_; }
 
-protected:
+ protected:
   std::mutex mutex_;
 };
 
diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h
index daebaffc85..6f8d7e0930 100644
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
@@ -55,12 +55,12 @@ std::string callPythonFunc(const std::string& moduleName,
  * NOTE: the lock of this guard is reentrant or recursive.
  */
 class PyGuard {
-public:
+ public:
   PyGuard();
   PyGuard(const PyGuard& other) = delete;
   PyGuard& operator=(const PyGuard& other) = delete;
 
-private:
+ private:
   std::lock_guard<std::recursive_mutex> guard_;
 };
 
@@ -133,7 +133,7 @@ std::string getPyCallStack();
  * Implements getAttr method for object.
  */
 class ObjectHelper {
-public:
+ public:
   explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {}
 
   /**
@@ -192,7 +192,7 @@ public:
     return PyObject_IsTrue(tmp.get());
   }
 
-private:
+ private:
   const PyObjectPtr& obj_;
 };
 
@@ -202,7 +202,7 @@ private:
  * The python sequence means list or tuple.
  */
 class SequenceHelper {
-public:
+ public:
   explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) {
     CHECK(PySequence_Check(seq_));
   }
@@ -248,12 +248,12 @@ public:
     }
   }
 
-private:
+ private:
   PyObject* seq_;
 };
 
 class DictHelper {
-public:
+ public:
   explicit DictHelper(PyObject* d) : dict_(d) {}
 
   explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {}
@@ -275,7 +275,7 @@ public:
     this->set(key, list);
   }
 
-private:
+ private:
   inline void checkDict() { CHECK(PyDict_Check(this->dict_)); }
 
   PyObject* dict_;
@@ -289,7 +289,7 @@ inline static bool isCallable(const PyObjectPtr& obj) {
  * Wrap a callable object.
  */
 class CallableHelper {
-public:
+ public:
   explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) {
     CHECK(py::isCallable(obj_));
   }
@@ -315,7 +315,7 @@ public:
     return PyObject_Call(obj_.get(), args.get(), kwargs.get());
   }
 
-private:
+ private:
   const PyObjectPtr& obj_;
   PyObjectPtr args;
   PyObjectPtr kwargs;
diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h
index f054738f87..189e1a14f7 100644
--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
@@ -56,7 +56,7 @@ namespace paddle {
  */
 template <class T>
 class Queue {
-public:
+ public:
   /**
    * @brief Construct Function. Default capacity of Queue is zero.
    */
@@ -147,7 +147,7 @@ public:
     });
   }
 
-private:
+ private:
   std::deque<T> elements_;
   int numElements_;
   std::mutex queueLock_;
@@ -185,7 +185,7 @@ private:
  */
 template <typename T>
 class BlockingQueue {
-public:
+ public:
   /**
    * @brief Construct Function.
    * @param[in] capacity the max numer of elements the queue can have.
@@ -244,7 +244,7 @@ public:
     return queue_.empty();
   }
 
-private:
+ private:
   std::mutex mutex_;
   std::condition_variable notEmpty_;
   std::condition_variable notFull_;
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index 79fd3b8cf0..100e9eba90 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -33,7 +33,7 @@ namespace paddle {
 class Stat;
 
 class StatInfo {
-public:
+ public:
   explicit StatInfo(Stat* stat = nullptr) : stat_(stat) {
     total_ = 0;
     max_ = 0;
@@ -61,7 +61,7 @@ class Stat;
 typedef std::shared_ptr<Stat> StatPtr;
 
 class StatSet {
-public:
+ public:
   explicit StatSet(const std::string& name) : name_(name) {}
   ~StatSet() {}
 
@@ -102,7 +102,7 @@ public:
   // pserver code logic, -_- ).
   void reset(bool clearRawData = true);
 
-private:
+ private:
   std::unordered_map<std::string, StatPtr> statSet_;
   const std::string name_;
   RWLock lock_;
@@ -112,7 +112,7 @@ extern StatSet globalStat;
 
 /*@brief : a simple stat*/
 class Stat {
-public:
+ public:
   explicit Stat(const std::string& statName)
       : destructStat_(nullptr), name_(statName), openThreadInfo_(false) {}
   ~Stat() {}
@@ -137,7 +137,7 @@ public:
 
   friend class StatInfo;
 
-private:
+ private:
   void mergeThreadStat(StatInfo& allThreadStat);
 
   std::mutex lock_;
@@ -164,7 +164,7 @@ inline uint64_t nowInMicroSec() {
  * A simple help class to measure time interval
  */
 class Timer {
-public:
+ public:
   explicit Timer(bool autoStart = true) : total_(0), startStamp_(0) {
     if (autoStart) {
       start();
@@ -181,13 +181,13 @@ public:
 
   void reset() { total_ = 0; }
 
-protected:
+ protected:
   uint64_t total_;
   uint64_t startStamp_;
 };
 
 class TimerOnce {
-public:
+ public:
   TimerOnce(Stat* stat,
             const char* info = "",
             uint64_t threshold = -1,
@@ -208,7 +208,7 @@ public:
     stat_->addSample(span);
   }
 
-private:
+ private:
   Stat* stat_;
   const char* info_;
   Timer timer_;
@@ -280,11 +280,11 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 #endif  // DISABLE_TIMER
 
 class GpuProfiler final {
-public:
+ public:
   GpuProfiler(std::string statName, std::string info);
   ~GpuProfiler();
 
-private:
+ private:
   std::lock_guard<std::recursive_mutex> guard_;
 };
 
diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h
index ef36a8c5b2..2ee6eba1a6 100644
--- a/paddle/utils/Thread.h
+++ b/paddle/utils/Thread.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class Thread {
-public:
+ public:
   /**
    * @brief Construct Function. Default thread pointer is null.
    */
@@ -62,7 +62,7 @@ public:
    */
   virtual void run() = 0;
 
-protected:
+ protected:
   std::unique_ptr<std::thread> thread_;
 };
 
@@ -73,7 +73,7 @@ protected:
  * Use addJob() to add a new job to the job queue.
  */
 class ThreadWorker : protected Thread {
-public:
+ public:
   typedef std::function<void()> JobFunc;
 
   /**
@@ -116,7 +116,7 @@ public:
     finishCV_.wait([this] { return empty_; });
   }
 
-protected:
+ protected:
   /**
    * @brief Execute jobs in the job queue sequentianlly,
    * @note If finish all the jobs in the job queue,
@@ -150,7 +150,7 @@ protected:
  * JobFunc can use tid to divide input data.
  */
 class SyncThreadPool {
-public:
+ public:
   typedef std::function<void(int tid, size_t numThreads)> JobFunc;
 
   /**
@@ -236,7 +236,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief Start all the workers in the pool, call their run() function.
    */
@@ -285,7 +285,7 @@ protected:
     }
   }
 
-protected:
+ protected:
   pid_t ownerThreadId_;
   bool stopping_;
   ThreadBarrier jobStartBarrier_;
@@ -323,7 +323,7 @@ protected:
  */
 template <class T>
 class MultiThreadWorker {
-public:
+ public:
   typedef T ResultType;
   typedef std::shared_ptr<ResultType> ResultPtrType;
   typedef std::function<ResultPtrType()> JobFunc;
@@ -424,7 +424,7 @@ public:
    */
   bool testResult() { return results_.empty(); }
 
-protected:
+ protected:
   /**
    * @brief Do the jobs in the job queue sequentianlly
    * and enqueue the result into the result queue.
@@ -476,7 +476,7 @@ protected:
  *    thread pool.
  */
 class AsyncThreadPool {
-public:
+ public:
   typedef std::function<void()> JobFunc;
 
   AsyncThreadPool() { LOG(FATAL) << "Not implemented"; }
@@ -594,7 +594,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief Execute the jobs in the job queue.
    */
@@ -606,7 +606,7 @@ protected:
     }
   }
 
-private:
+ private:
   std::vector<std::unique_ptr<std::thread>> workers_;
   Queue<JobFunc> jobs_;
   bool stopping_;
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index 0a27b8b97b..c5b07506d3 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -49,7 +49,7 @@ namespace paddle {
  */
 template <class T>
 class ThreadLocal {
-public:
+ public:
   ThreadLocal() {
     CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
   }
@@ -92,7 +92,7 @@ public:
    */
   operator T*() { return get(); }
 
-private:
+ private:
   static void dataDestructor(void* p) { delete (T*)p; }
 
   pthread_key_t threadSpecificKey_;
@@ -111,7 +111,7 @@ private:
  */
 template <class T>
 class ThreadLocalD {
-public:
+ public:
   ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
   ~ThreadLocalD() {
     pthread_key_delete(threadSpecificKey_);
@@ -150,7 +150,7 @@ public:
    */
   T& operator*() { return *get(); }
 
-private:
+ private:
   static void dataDestructor(void* p) { delete (T*)p; }
 
   void updateMap(T* p) {
@@ -172,7 +172,7 @@ private:
  * @brief Thread-safe C-style random API.
  */
 class ThreadLocalRand {
-public:
+ public:
   /**
    * initSeed just like srand,
    * called by main thread,
@@ -205,7 +205,7 @@ public:
    */
   static int getDefaultSeed() { return defaultSeed_; }
 
-protected:
+ protected:
   static unsigned int defaultSeed_;
   static ThreadLocal<unsigned int> seed_;
 };
@@ -214,7 +214,7 @@ protected:
  * @brief Thread-safe C++ style random engine.
  */
 class ThreadLocalRandomEngine {
-public:
+ public:
   /**
    * get random_engine for each thread.
    *
@@ -222,7 +222,7 @@ public:
    */
   static std::default_random_engine& get();
 
-protected:
+ protected:
   static ThreadLocal<std::default_random_engine> engine_;
 };
 
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 9579881ea3..e6f05e30d3 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -179,7 +179,7 @@ void loadFileList(const std::string& fileListFileName,
  */
 void registerInitFunction(std::function<void()> func, int priority = 0);
 class InitFunction {
-public:
+ public:
   explicit InitFunction(std::function<void()> func, int priority = 0) {
     registerInitFunction(func, priority);
   }
@@ -191,7 +191,7 @@ public:
  * When the SetDevice object is destructed, it will restore device environment.
  */
 class SetDevice {
-public:
+ public:
   explicit SetDevice(int deviceId) {
     isSet_ = deviceId >= 0;
     devId_ = 0;
@@ -206,7 +206,7 @@ public:
     }
   }
 
-protected:
+ protected:
   bool isSet_;
   int devId_;
 };
@@ -240,7 +240,7 @@ inline void enablePeerAccess(int d1, int d2) {
  * }
  */
 class AsyncGpuBlock {
-public:
+ public:
   AsyncGpuBlock() : syncFlag_(hl_get_sync_flag()) { hl_set_sync_flag(false); }
   ~AsyncGpuBlock() {
     if (syncFlag_) {
@@ -249,7 +249,7 @@ public:
     }
   }
 
-private:
+ private:
   bool syncFlag_;
 };
 
@@ -378,7 +378,7 @@ std::string join(const std::string& part1,
  * A Checker for each invoke of method in same thread.
  */
 class SameThreadChecker {
-public:
+ public:
   SameThreadChecker() {}
 
   /**
@@ -400,7 +400,7 @@ public:
         << invokeThreadId_ << " current invoked in " << curThreadId;
   }
 
-private:
+ private:
   std::once_flag onceFlag_;
   std::thread::id invokeThreadId_;
 };
@@ -421,7 +421,7 @@ private:
  */
 template <typename KType, typename VType, typename Hash>
 class WeakKVCache {
-public:
+ public:
   WeakKVCache() {}
 
   std::shared_ptr<VType> get(const KType& key,
@@ -442,7 +442,7 @@ public:
     return retVal;
   }
 
-private:
+ private:
   std::mutex lock_;
   std::unordered_map<KType, std::weak_ptr<VType>, Hash> storage_;
 };
@@ -453,7 +453,7 @@ private:
  */
 template <typename CallbackType, typename... Args>
 class ScopedCallbacks {
-public:
+ public:
   ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args)
       : exit_(std::bind(exit, args...)) {
     enter(args...);
@@ -464,7 +464,7 @@ public:
 
   ~ScopedCallbacks() { exit_(); }
 
-private:
+ private:
   std::function<void()> exit_;
 };
 
@@ -475,7 +475,7 @@ private:
  */
 template <typename T, size_t Alignment>
 class AlignedAllocator {
-public:
+ public:
   /// std campatible typedefs.
   typedef T* pointer;
   typedef const T* const_pointer;
@@ -552,12 +552,12 @@ public:
     return this->allocate(n);
   }
 
-private:
+ private:
   AlignedAllocator& operator=(const AlignedAllocator&);  // disable
 };
 
 class Deprecated {
-public:
+ public:
   explicit Deprecated(const std::string& msg = "") {
     if (msg.empty()) {
       LOG(WARNING) << "This class is deprecated, please do not use this class.";
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
index a4e6c8f7b8..409af8bce3 100644
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 
 namespace paddle {
 class SemaphorePrivate {
-public:
+ public:
   sem_t sem;
 };
 
@@ -45,7 +45,7 @@ void Semaphore::post() { sem_post(&m->sem); }
 #ifdef PADDLE_USE_PTHREAD_SPINLOCK
 
 class SpinLockPrivate {
-public:
+ public:
   inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
   inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
 
@@ -63,7 +63,7 @@ public:
 // clang-format on
 
 class SpinLockPrivate {
-public:
+ public:
   inline void lock() {
     while (lock_.test_and_set(std::memory_order_acquire)) {
     }
@@ -86,7 +86,7 @@ void SpinLock::unlock() { m->unlock(); }
 #ifdef PADDLE_USE_PTHREAD_BARRIER
 
 class ThreadBarrierPrivate {
-public:
+ public:
   pthread_barrier_t barrier_;
 
   inline explicit ThreadBarrierPrivate(int count) {
@@ -101,7 +101,7 @@ public:
 #else
 
 class ThreadBarrierPrivate {
-public:
+ public:
   pthread_mutex_t mutex_;
   pthread_cond_t cond_;
   int count_;
diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp
index e03992363f..f3905091bd 100644
--- a/paddle/utils/arch/osx/Locks.cpp
+++ b/paddle/utils/arch/osx/Locks.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class SemaphorePrivate {
-public:
+ public:
   ~SemaphorePrivate() { dispatch_release(sem); }
 
   dispatch_semaphore_t sem;
@@ -45,7 +45,7 @@ void Semaphore::wait() {
 void Semaphore::post() { dispatch_semaphore_signal(m->sem); }
 
 class SpinLockPrivate {
-public:
+ public:
   std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
   char padding_[64 - sizeof(lock_)];  // Padding to cache line size
 };
@@ -61,7 +61,7 @@ void SpinLock::lock() {
 void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); }
 
 class ThreadBarrierPrivate {
-public:
+ public:
   pthread_mutex_t mutex_;
   pthread_cond_t cond_;
   int count_;
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index 317cf037c6..d48c54fcbb 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size):
+def batch(reader, batch_size, drop_last=False):
     """
     Create a batched reader.
 
@@ -23,6 +23,8 @@ def batch(reader, batch_size):
     :type reader: callable
     :param batch_size: size of each mini-batch
     :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
     :return: the batched reader.
     :rtype: callable
     """
@@ -35,7 +37,7 @@ def batch(reader, batch_size):
             if len(b) == batch_size:
                 yield b
                 b = []
-        if b:
+        if drop_last == False and len(b) != 0:
             yield b
 
     return batch_reader
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index f082e33be3..527044b415 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -119,7 +119,8 @@ def reader_creator(data_file,
                 yield sample, int(label) - 1
 
     if use_xmap:
-        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
+        return xmap_readers(mapper, reader, cpu_num, buffered_size)
     else:
         return map_readers(mapper, reader)
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 67aa5ec997..bd985ad733 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -26,6 +26,7 @@ from trainer import BeginEpochEvent
 from trainer import EndEpochEvent
 from trainer import BeginStepEvent
 from trainer import EndStepEvent
+from trainer import CheckpointConfig
 
 import inferencer
 from inferencer import Inferencer
@@ -44,10 +45,11 @@ import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
-from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, \
-    InferenceTranspiler, memory_optimize, release_memory
+from transpiler import DistributeTranspiler, InferenceTranspiler, \
+    memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
+from lod_tensor import create_lod_tensor, create_random_int_lodtensor
 import clip
 import profiler
 import unique_name
@@ -59,7 +61,7 @@ Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
           trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
-          parallel_executor.__all__ + [
+          parallel_executor.__all__ + lod_tensor.__all__ + [
               'io',
               'initializer',
               'layers',
@@ -115,11 +117,11 @@ def __bootstrap__():
 
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope'
+        'eager_delete_scope', 'use_mkldnn'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune'
+            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 32b1b65bd9..4f9622d04d 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -51,6 +51,12 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
         op_desc.set_input(para, args)
     for para, args in outputs.iteritems():
         op_desc.set_output(para, args)
+
+    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+
+    if op_role_attr_name not in attrs:
+        attrs[
+            op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
     for name, val in attrs.iteritems():
         if isinstance(val, framework.Block):
             op_desc.set_block_attr(name, val.desc)
@@ -141,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs):
             else:
                 if len(renamed_vars[var_name]) == 1:
                     new_name = var_name + "@RENAME@" + \
-                        str(var_rename_count[var_name])
+                               str(var_rename_count[var_name])
                     var_rename_count[var_name] += 1
                     # rename original var_name
                     renamed_vars[var_name][0] = new_name
@@ -149,7 +155,7 @@ def _addup_repetitive_outputs_(op_descs):
                     _rename_arg_(pending_sum_ops, var_name, new_name)
 
                 new_name = var_name + "@RENAME@" + \
-                    str(var_rename_count[var_name])
+                           str(var_rename_count[var_name])
                 var_rename_count[var_name] += 1
                 op_desc.rename_output(var_name, new_name)
                 renamed_vars[var_name].append(new_name)
@@ -335,9 +341,12 @@ def _append_backward_ops_(block,
                                             no_grad_dict[block.idx])
 
     # append op_desc in grad_op_descs to target_block
+    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+    backward = core.op_proto_and_checker_maker.OpRole.Backward
     for op_desc in grad_op_descs:
         new_op_desc = target_block.desc.append_op()
         new_op_desc.copy_from(op_desc)
+        new_op_desc.set_attr(op_role_attr_name, backward)
         grad_to_var["__current_op_desc__"] = new_op_desc
         if callbacks is not None:
             assert (isinstance(callbacks, list))
@@ -439,6 +448,22 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
         (list[(Variable,Variable)]): list of (parameter, gradient) pair.
     """
     assert isinstance(loss, framework.Variable)
+
+    if loss.op is None:
+        # the loss is from a cloned program. Find loss op manually.
+        for op in reversed(loss.block.ops):
+            assert isinstance(op, framework.Operator)
+            if len(op.output_arg_names) == 1 and op.output_arg_names[
+                    0] == loss.name:
+                loss.op = op
+                break
+        if loss.op is None:
+            raise ValueError("loss.op is None. Should not happend")
+
+    loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
+                     int(core.op_proto_and_checker_maker.OpRole.Forward) |
+                     int(core.op_proto_and_checker_maker.OpRole.Loss))
+
     if callbacks is not None:
         isinstance(callbacks, list)
 
@@ -456,12 +481,16 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     current_block_idx = program.current_block_idx
     grad_to_var = dict()
 
-    op_desc = _create_op_desc_("fill_constant", {}, {
-        "Out": [_append_grad_suffix_(loss.name)]
-    }, {"shape": [1],
-        "value": 1.0,
-        "dtype": loss.dtype,
-        "force_cpu": False})
+    op_desc = _create_op_desc_(
+        "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
+            "shape": [1],
+            "value": 1.0,
+            "dtype": loss.dtype,
+            "force_cpu": False,
+            core.op_proto_and_checker_maker.kOpRoleAttrName():
+            int(core.op_proto_and_checker_maker.OpRole.Backward) |
+            int(core.op_proto_and_checker_maker.OpRole.Loss),
+        })
     root_block.desc.append_op().copy_from(op_desc)
 
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
@@ -505,6 +534,24 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
             params_and_grads.append((param_var, grad_var))
         else:
             params_and_grads.append((param_var, None))
+
+    op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+    for p, g in params_and_grads:
+        if g is None:
+            continue
+        for op in reversed(program.global_block().ops):
+            assert isinstance(op, framework.Operator)
+            if g.name in op.output_arg_names:
+                g.op = op
+                break
+
+        if g.op is None:
+            raise ValueError("Unexpected branch")
+        attr_val = [p.name, g.name]
+        if g.op.has_attr(op_role_var_attr_name):
+            attr_val.extend(g.op.attr(op_role_var_attr_name))
+        g.op.set_attr(op_role_var_attr_name, attr_val)
+
     return params_and_grads
 
 
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 12add9e686..66c3fc6b66 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -214,21 +214,24 @@ def set_gradient_clip(clip, param_list=None, program=None):
 
 def append_gradient_clip_ops(param_grad):
     context = dict()
-    create_op_callbacks = []
     for p, g in param_grad:
-        clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
-        if clip_attr is None:
-            clip_attr = NullGradientClipAttr()
-        if not isinstance(clip_attr, BaseGradientClipAttr):
-            raise TypeError(
-                "clip attribute should be an instance of BaseGradientClipAttr")
+        with p.block.program.optimized_guard(p):
+            clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
+            if clip_attr is None:
+                clip_attr = NullGradientClipAttr()
+            if not isinstance(clip_attr, BaseGradientClipAttr):
+                raise TypeError(
+                    "clip attribute should be an instance of BaseGradientClipAttr"
+                )
 
-        clip_attr.process_context(context=context, param=p, grad=g)
-        create_op_callbacks.append(
-            functools.partial(
-                clip_attr.create_operators, param=p, grad=g))
+            clip_attr.process_context(context=context, param=p, grad=g)
+
+    res = []
+    for p, g in param_grad:
+        with p.block.program.optimized_guard(p):
+            res.append(clip_attr.create_operators(param=p, grad=g))
 
-    return [each_callback() for each_callback in create_op_callbacks]
+    return res
 
 
 ClipByValue = GradientClipByValue
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index a44e078d0c..e2013137b1 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import core
 import numpy
+import os
 import six.moves as six
 import multiprocessing
 
@@ -36,9 +37,11 @@ class DataToLoDTensorConverter(object):
             self.dtype = 'float64'
         elif dtype == core.VarDesc.VarType.INT32:
             self.dtype = 'int32'
+        elif dtype == core.VarDesc.VarType.UINT8:
+            self.dtype = 'uint8'
         else:
             raise ValueError("dtype must be any of [int32, float32, int64, "
-                             "float64]")
+                             "float64, uint8]")
 
         self.data = []
         self.lod = []
@@ -148,7 +151,9 @@ class DataFeeder(object):
         elif isinstance(self.place, core.CUDAPlace):
             return core.get_cuda_device_count()
         else:
-            return multiprocessing.cpu_count()
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            return cpu_num
 
     def decorate_reader(self,
                         reader,
diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debugger.py
similarity index 100%
rename from python/paddle/fluid/debuger.py
rename to python/paddle/fluid/debugger.py
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 1ee1d37271..7c6ad6f27d 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -273,10 +273,11 @@ class DetectionMAP(Evaluator):
             [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
         gt_label (Variable): The ground truth label index, which is a LoDTensor
             with shape [N, 1].
-        gt_difficult (Variable): Whether this ground truth is a difficult
-            bounding box (bbox), which is a LoDTensor [N, 1].
         gt_box (Variable): The ground truth bounding box (bbox), which is a
             LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
+        gt_difficult (Variable|None): Whether this ground truth is a difficult
+            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
+            it means all the ground truth labels are not difficult bbox.
         class_num (int): The class number.
         background_label (int): The index of background label, the background
             label will be ignored. If set to -1, then all categories will be
@@ -284,7 +285,8 @@ class DetectionMAP(Evaluator):
         overlap_threshold (float): The threshold for deciding true/false
             positive, 0.5 by defalut.
         evaluate_difficult (bool): Whether to consider difficult ground truth
-            for evaluation, True by defalut.
+            for evaluation, True by defalut. This argument does not work when
+            gt_difficult is None.
         ap_version (string): The average precision calculation ways, it must be
             'integral' or '11point'. Please check
             https://sanchom.wordpress.com/tag/average-precision/ for details.
@@ -295,7 +297,7 @@ class DetectionMAP(Evaluator):
 
         exe = fluid.executor(place)
         map_evaluator = fluid.Evaluator.DetectionMAP(input,
-            gt_label, gt_difficult, gt_box)
+            gt_label, gt_box, gt_difficult)
         cur_map, accum_map = map_evaluator.get_map_var()
         fetch = [cost, cur_map, accum_map]
         for epoch in PASS_NUM:
@@ -313,8 +315,8 @@ class DetectionMAP(Evaluator):
                  input,
                  gt_label,
                  gt_box,
-                 gt_difficult,
-                 class_num,
+                 gt_difficult=None,
+                 class_num=None,
                  background_label=0,
                  overlap_threshold=0.5,
                  evaluate_difficult=True,
@@ -322,8 +324,11 @@ class DetectionMAP(Evaluator):
         super(DetectionMAP, self).__init__("map_eval")
 
         gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
-        gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
-        label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        if gt_difficult:
+            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
+            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        else:
+            label = layers.concat([gt_label, gt_box], axis=1)
 
         # calculate mean average precision (mAP) of current mini-batch
         map = layers.detection_map(
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 93aa5f908e..33d8f70941 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list):
             return var.desc.name()
         elif isinstance(var, str):
             return var
+        elif isinstance(var, basestring):
+            return str(var)
         else:
             raise TypeError(str(var) + " should be Variable or str")
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 161ea55586..bbd35aaecb 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
         return core.VarDesc.VarType.INT64
     elif dtype == np.bool:
         return core.VarDesc.VarType.BOOL
+    elif dtype == np.uint16:
+        return core.VarDesc.VarType.INT16
     elif dtype == np.uint8:
         return core.VarDesc.VarType.UINT8
     else:
@@ -361,6 +363,13 @@ class OpProtoHolder(object):
             raise ValueError("Operator \"%s\" has not been registered." % type)
         return self.op_proto_map[type]
 
+    @staticmethod
+    def generated_op_attr_names():
+        return {
+            core.op_proto_and_checker_maker.kOpRoleAttrName(),
+            core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        }
+
 
 class Operator(object):
     """
@@ -368,6 +377,13 @@ class Operator(object):
     Block. Users can use the build in instructions to describe their neural
     network.
     """
+    OP_WITHOUT_KERNEL_SET = {
+        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
+        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
+        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
+        'channel_recv', 'select'
+    }
 
     def __init__(self,
                  block,
@@ -404,6 +420,23 @@ class Operator(object):
         self.block = block
         self.desc = desc
         self.attrs = attrs
+        if self.attrs is None:
+            self.attrs = dict()
+        del attrs
+
+        op_maker = core.op_proto_and_checker_maker
+
+        if op_maker.kOpRoleAttrName() not in self.attrs:
+            self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
+
+        role_var_name = op_maker.kOpRoleVarAttrName()
+        if len(self.block.program.
+               op_role_var) != 0 and role_var_name not in self.attrs:
+            self.attrs[role_var_name] = self.block.program.op_role_var
+
+        if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0:
+            del self.attrs[role_var_name]
+
         if len(self.desc.type()) != 0:
             return
         if type is None:
@@ -469,34 +502,31 @@ class Operator(object):
                     arg.op = self
                 self.desc.set_output(out_proto.name, out_arg_names)
 
-        if attrs is not None:
-            if not isinstance(attrs, dict):
+        if self.attrs is not None:
+            if not isinstance(self.attrs, dict):
                 raise TypeError("'attrs' should be a dict.")
             for attr in proto.attrs:
                 attr_name = attr.name
-                if (attr_name not in attrs) or (attrs[attr_name] is None):
+                if (attr_name not in self.attrs) or (
+                        self.attrs[attr_name] is None):
                     continue
-                if isinstance(attrs[attr_name], Block):
-                    self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
-                elif isinstance(attrs[attr_name], core.BlockDesc) or \
-                        isinstance(attrs[attr_name], core.ProgramDesc):
+                if isinstance(self.attrs[attr_name], Block):
+                    self.desc.set_block_attr(attr_name,
+                                             self.attrs[attr_name].desc)
+                elif isinstance(self.attrs[attr_name], core.BlockDesc) or \
+                        isinstance(self.attrs[attr_name], core.ProgramDesc):
                     self.desc.set_serialized_attr(
-                        attr_name, attrs[attr_name].serialize_to_string())
+                        attr_name, self.attrs[attr_name].serialize_to_string())
                 else:
-                    self.desc.set_attr(attr_name, attrs[attr_name])
-
+                    self.desc.set_attr(attr_name, self.attrs[attr_name])
         self.desc.check_attrs()
-        no_kernel_op_set = {
-            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
-            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
-            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv', 'select', 'gen_nccl_id'
-        }
-        if type not in no_kernel_op_set:
+        if self.has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
 
+    def has_kernel(self, op_type):
+        return op_type not in self.OP_WITHOUT_KERNEL_SET
+
     def to_string(self, throw_on_error):
         """
         To debug string.
@@ -612,6 +642,10 @@ class Operator(object):
         """
         return self.desc.attr_type(name)
 
+    def set_attr(self, name, val):
+        self.attrs[name] = val
+        self.desc.set_attr(name, val)
+
     @property
     def attr_names(self):
         """
@@ -720,7 +754,9 @@ class Block(object):
 
     def var(self, name):
         if not isinstance(name, basestring):
-            raise TypeError()
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
@@ -775,7 +811,7 @@ class Block(object):
         Rename variable in vars and ops' inputs and outputs
         """
         if not self.has_var(name):
-            raise ValueError("var %s is not in current" % name)
+            raise ValueError("var %s is not in current block" % name)
         v = self.var(name)
         if type(v) == Parameter:
             var_type = "Parameter"
@@ -821,6 +857,7 @@ class Block(object):
         self.vars[new_name] = var
         del self.vars[name]
         self.sync_with_cpp()
+        return var
 
     def remove_var(self, name):
         self.sync_with_cpp()
@@ -1002,6 +1039,33 @@ class Program(object):
         self.blocks = [Block(self, 0)]
         self.current_block_idx = 0
         self._seed = 0
+        self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
+        self._op_role_var = []
+
+    @property
+    def op_role(self):
+        return self._current_role
+
+    @op_role.setter
+    def set_op_role(self, role):
+        self._current_role = role
+
+    @property
+    def op_role_var(self):
+        return self._op_role_var
+
+    @op_role_var.setter
+    def set_op_role_var(self, var_name):
+        self._op_role_var = [var_name]
+
+    @contextlib.contextmanager
+    def optimized_guard(self, var):
+        OpRole = core.op_proto_and_checker_maker.OpRole
+        self._current_role = OpRole.Optimize
+        self._op_role_var = [var.name if isinstance(var, Variable) else var]
+        yield
+        self._op_role_var = []
+        self._current_role = OpRole.Forward
 
     def __str__(self):
         return self.to_string(True)
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index 894f6dbfad..6baac00905 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -56,7 +56,9 @@ class Inferencer(object):
         else:
             self.exe = executor.Executor(self.place)
 
-    def infer(self, inputs):
+        self.inference_program = self.inference_program.clone(for_test=True)
+
+    def infer(self, inputs, return_numpy=True):
         """
         :param inputs: a map of {"input_name": input_var} that will be feed into the inference program
         to get the predict value
@@ -66,9 +68,11 @@ class Inferencer(object):
             raise ValueError(
                 "inputs should be a map of {'input_name': input_var}")
 
-        with self._prog_and_scope_guard():
-            results = self.exe.run(feed=inputs,
-                                   fetch_list=[self.predict_var.name])
+        with executor.scope_guard(self.scope):
+            results = self.exe.run(self.inference_program,
+                                   feed=inputs,
+                                   fetch_list=[self.predict_var],
+                                   return_numpy=return_numpy)
 
         return results
 
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 54506e97ed..6323c9899e 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -13,21 +13,19 @@
 # limitations under the License.
 
 import os
+import time
+import shutil
 
 from paddle.fluid.evaluator import Evaluator
 from paddle.fluid.framework import Program, Parameter, default_main_program, Variable
 from . import core
 
 __all__ = [
-    'save_vars',
-    'save_params',
-    'save_persistables',
-    'load_vars',
-    'load_params',
-    'load_persistables',
-    'save_inference_model',
-    'load_inference_model',
-    'get_inference_program',
+    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
+    'load_persistables', 'save_inference_model', 'load_inference_model',
+    'get_inference_program', 'save_checkpoint', 'load_checkpoint',
+    'clean_checkpoint', 'load_persist_vars_without_grad',
+    'save_persist_vars_without_grad', 'get_latest_checkpoint_serial'
 ]
 
 
@@ -195,6 +193,8 @@ def load_vars(executor,
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
+            if each_var.type == core.VarDesc.VarType.RAW:
+                continue
             new_var = _clone_var_in_block_(load_block, each_var)
             if filename is None:
                 load_block.append_op(
@@ -454,3 +454,293 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
+
+
+SUCCESS_MARK_FILENAME = "_SUCCESS"
+CHECKPOINT_PREFIX = "checkpoint"
+MODEL_DIR = "__model__"
+TRAINER_PREFIX = "trainer"
+CHECKPOINT_SEPARATOR = "_"
+
+
+def save_checkpoint(executor,
+                    checkpoint_dir,
+                    trainer_id,
+                    trainer_args=None,
+                    main_program=None,
+                    max_num_checkpoints=3):
+    """
+    Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory,
+    the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy
+    to keep numbers of checkpoint directory,  the numbers of checkpoint directory are max_num_checkpoints at most,
+    The interval between two saved checkpoints must greater than save_interval_secs.
+
+    :param executor executor for save the value
+    :param checkpoint_dir the checkpoint directory 
+    :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief
+    :param main_program   will save all variables in program 
+    :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints
+    """
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+
+    if trainer_args:
+        assert isinstance(trainer_args, dict)
+
+    if not os.path.isdir(checkpoint_dir):
+        os.makedirs(checkpoint_dir)
+
+    serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+
+    save_trainer_args(cur_dir, trainer_id, trainer_args)
+
+    if trainer_id == 0:
+        save_persist_vars_without_grad(executor, cur_dir, main_program)
+
+    _scroll_delete(checkpoint_dir, max_num_checkpoints)
+
+
+def load_checkpoint(executor, checkpoint_dir, serial, main_program):
+    """
+    Load checkpoint from a directory by executor,
+    it will find  the most recent saved checkpoint file and load it auto.
+
+    :param executor executor for load the value
+    :param checkpoint_dir  the checkpoint directory 
+    :param serial the serial folder in checkpoint directory will be load
+    :param main_program  will load all variables in program 
+    """
+
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+
+    if serial is None or serial < 0:
+        raise ValueError("'serial' should not be None or <0 ")
+
+    if main_program is None:
+        raise ValueError('main_program should not be None.')
+
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    load_persist_vars_without_grad(executor, cur_dir, main_program, True)
+
+
+def clean_checkpoint(checkpoint_dir, delete_dir=False):
+    """
+    clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before.
+    delete_dir only works when the directory is empty, otherwise, OSError is raised.  
+
+    :param checkpoint_dir
+    :param delete_dir
+    """
+
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+    _scroll_delete(checkpoint_dir, max_num_checkpoints=0)
+
+    if delete_dir and not os.listdir(checkpoint_dir):
+        os.rmdir(checkpoint_dir)
+
+
+def load_persist_vars_without_grad(executor,
+                                   dirname,
+                                   program,
+                                   has_model_dir=False):
+    """
+    load_persist_vars_without_grad will load variables from a directory by an executor,
+    the variable named end with "@GRAD" will not be loaded.
+
+    :param executor  executor for load the value
+    :param dirname the checkpoint directory 
+    :param program   will load all variables in program 
+    :param has_model_dir if has_model_dir is True, will load variables from  sub directory named __model__
+    """
+
+    if has_model_dir:
+        dirname = _get_model_dir(dirname)
+
+    load_vars(
+        executor,
+        dirname=dirname,
+        main_program=program,
+        predicate=_is_checkpoint_var,
+        filename=None)
+
+
+def save_persist_vars_without_grad(executor, dirname, program):
+    """
+    save_persist_vars_without_grad  will save variables to a directory by an executor,
+    the variable named end with "@GRAD" will not be saved.
+
+    :param executor  executor for load the value
+    :param dirname the checkpoint directory 
+    :param program   will load all variables in program
+    """
+    cur_dir = _get_model_dir(dirname)
+    save_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=program,
+        vars=None,
+        predicate=_is_checkpoint_var,
+        filename=None)
+    _write_success(cur_dir)
+
+
+def save_trainer_args(dirname, trainer_id, trainer_args):
+    assert isinstance(trainer_args, dict)
+
+    cur_dir = _get_trainer_dir(dirname, trainer_id)
+
+    for name, value in trainer_args.iteritems():
+        args_file = os.path.join(cur_dir, name)
+        with open(args_file, 'w') as f:
+            f.write(str(value))
+    _write_success(cur_dir)
+
+
+def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
+    assert isinstance(trainer_args, list)
+
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    cur_dir = _get_trainer_dir(cur_dir, trainer_id)
+
+    ret_values = []
+
+    for arg in trainer_args:
+        cur_file = os.path.join(cur_dir, arg)
+        with open(cur_file, 'r') as f:
+            contents = f.read()
+            ret_values.append(contents.strip())
+    return ret_values
+
+
+def _is_checkpoint_var(var):
+    """
+    the checkpoint will not save or load all the variables.
+    var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+    :param var
+    """
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.RAW:
+        return False
+    # @GRAD are named for gradient variables, checkpoint will not save it.
+    if "@GRAD" in var.name:
+        return False
+    # .trainer_ are named for distribute train variables, checkpoint will not save it.
+    if ".trainer_" in var.name:
+        return False
+
+    # .block is named for distribute train variables, checkpoint will not save it.
+    if ".block" in var.name:
+        return False
+
+    return var.persistable
+
+
+def _get_dir_serial(dirname):
+    _, serial = dirname.split(CHECKPOINT_SEPARATOR)
+
+    try:
+        serial_num = int(serial)
+    except ValueError:
+        serial_num = -1
+    return serial_num
+
+
+def _get_serial_dir(dirname, serial):
+    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
+    serial_dir = os.path.join(dirname, serial_folder)
+
+    if not os.path.isdir(serial_dir):
+        os.makedirs(serial_dir)
+
+    return serial_dir
+
+
+def _get_model_dir(dirname):
+    model_dir = os.path.join(dirname, MODEL_DIR)
+
+    if not os.path.isdir(model_dir):
+        os.makedirs(model_dir)
+
+    return model_dir
+
+
+def _get_trainer_dir(dirname, trainer_id):
+    trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
+    trainer_dir = os.path.join(dirname, trainer_folder)
+
+    if not os.path.isdir(trainer_dir):
+        os.makedirs(trainer_dir)
+
+    return trainer_dir
+
+
+def _scroll_delete(dirname, max_num_checkpoints=3):
+    dirs = os.listdir(dirname)
+    serial_map = {}
+    for serial in dirs:
+        serial_num = _get_dir_serial(serial)
+        serial_map[serial_num] = serial
+
+    if len(serial_map.keys()) <= max_num_checkpoints:
+        return
+
+    serials = serial_map.keys()
+    serials.sort(reverse=True)
+    serials = serials[max_num_checkpoints:]
+    for serial in serials:
+        cur_dir = _get_serial_dir(dirname, serial)
+        shutil.rmtree(cur_dir)
+
+
+def _write_success(dirname):
+    """
+    write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.
+
+    :param dirname
+    """
+    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
+    with open(success_file, 'a') as f:
+        now = time.ctime()
+        f.write(now)
+
+
+def get_latest_checkpoint_serial(checkpoint_dir):
+    """
+    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
+
+    :param checkpoint_dir
+    """
+    if not checkpoint_dir:
+        return -1
+
+    def has_success(checkpoint_dir, cur_dir):
+        """
+        is _SUCCESS in this dir
+        """
+
+        serial = _get_dir_serial(cur_dir)
+        if serial == -1 or not os.path.isdir(
+                os.path.join(checkpoint_dir, cur_dir)):
+            return -1
+
+        success_path = os.path.join(
+            _get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
+            SUCCESS_MARK_FILENAME)
+        if os.path.isfile(success_path):
+            return serial
+
+    if not os.path.isdir(checkpoint_dir):
+        return -1
+
+    current_dir = -1
+    dirs = os.listdir(checkpoint_dir)
+    for cur_dir in dirs:
+        success_num = has_success(checkpoint_dir, cur_dir)
+        if success_num > current_dir:
+            current_dir = success_num
+    return current_dir
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index dee4144808..80e8ff484a 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import contextlib
 
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
@@ -721,26 +721,22 @@ def lod_rank_table(x, level=0):
     return table
 
 
+@templatedoc()
 def max_sequence_len(rank_table):
-    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
-    returns the max length of a batch of sequences. In fact, a LoDRankTable
-    object contains a list of tuples(<sequence index, sequence length>) and
-    the list is already sorted by sequence length in descending order, so the
-    operator just returns the sequence length of the first tuple element.
+    """
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32',
+    >>>                       lod_level=1)
+    >>> rank_table = layers.lod_rank_table(x=x, level=0)
+    >>> max_seq_len = layers.max_sequence_len(rank_table)
 
     Args:
-        rank_table (Variable): Input variable which is a LoDRankTable object.
+        rank_table(${rank_table_type}): ${rank_table_comment}.
 
     Returns:
-        Variable: The max length of sequence.
-
-    Examples:
-        .. code-block:: python
-
-            x = fluid.layers.data(name='x', shape=[10],
-                            dtype='float32', lod_level=1)
-            rank_table = layers.lod_rank_table(x=x, level=0)
-            max_seq_len = layers.max_sequence_len(rank_table)
+        ${out_comment}.
     """
     helper = LayerHelper("max_seqence_len", **locals())
     res = helper.create_tmp_variable(dtype="int64")
@@ -1098,7 +1094,7 @@ class ConditionalBlock(object):
         input_set = set([ipt.name for ipt in self.inputs])
 
         param_list = [
-            parent_block.var(each_name) for each_name in params
+            parent_block.var_recursive(each_name) for each_name in params
             if each_name not in input_set
         ]
 
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index b33adf55cf..3a83db12fd 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -569,7 +569,7 @@ def prior_box(input,
               image,
               min_sizes,
               max_sizes=None,
-              aspect_ratios=None,
+              aspect_ratios=[1.],
               variance=[0.1, 0.1, 0.2, 0.2],
               flip=False,
               clip=False,
@@ -589,19 +589,19 @@ def prior_box(input,
        input(Variable): The Input Variables, the format is NCHW.
        image(Variable): The input image data of PriorBoxOp,
             the layout is NCHW.
-       min_sizes(list|tuple): min sizes of generated prior boxes.
+       min_sizes(list|tuple|float value): min sizes of generated prior boxes.
        max_sizes(list|tuple|None): max sizes of generated prior boxes.
             Default: None.
-       aspect_ratios(list|tuple): the aspect ratios of generated prior
-            boxes. Default: None.
+       aspect_ratios(list|tuple|float value): the aspect ratios of generated
+            prior boxes. Default: [1.].
        variance(list|tuple): the variances to be encoded in prior boxes.
             Default:[0.1, 0.1, 0.2, 0.2].
        flip(bool): Whether to flip aspect ratios. Default:False.
        clip(bool): Whether to clip out-of-boundary boxes. Default: False.
-       step(list|turple): Prior boxes step across weight and height, If
+       step(list|turple): Prior boxes step across width and height, If
             step[0] == 0.0/step[1] == 0.0, the prior boxes step across
-            height/weight  of the input will be automatically calculated.
-            Default: [0.0]
+            height/weight of the input will be automatically calculated.
+            Default: [0., 0.]
        offset(float): Prior boxes center offset. Default: 0.5
        name(str): Name of the prior box op. Default: None.
 
@@ -630,6 +630,21 @@ def prior_box(input,
     helper = LayerHelper("prior_box", **locals())
     dtype = helper.input_dtype()
 
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(min_sizes):
+        min_sizes = [min_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
+        raise ValueError('steps should be a list or tuple ',
+                         'with length 2, (step_width, step_height).')
+
+    min_sizes = list(map(float, min_sizes))
+    aspect_ratios = list(map(float, aspect_ratios))
+    steps = list(map(float, steps))
+
     attrs = {
         'min_sizes': min_sizes,
         'aspect_ratios': aspect_ratios,
@@ -641,6 +656,8 @@ def prior_box(input,
         'offset': offset
     }
     if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
+        if not _is_list_or_tuple_(max_sizes):
+            max_sizes = [max_sizes]
         attrs['max_sizes'] = max_sizes
 
     box = helper.create_tmp_variable(dtype)
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 4d6ee3c51b..9de88e2c32 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
@@ -18,10 +19,12 @@ from ..unique_name import generate as unique_name
 from control_flow import BlockGuard
 from ..layer_helper import LayerHelper
 from ..executor import global_scope
+from layer_function_generator import generate_layer_fn, templatedoc
 
 __all__ = [
     'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer'
+    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
+    'random_data_generator', 'Preprocessor', 'load'
 ]
 
 
@@ -193,21 +196,23 @@ def Send(endpoints, send_vars, get_vars=None):
     endpoints = list(set(epmap))
 
     helper = LayerHelper("Send", **locals())
-    rpc_client_var = default_main_program().global_block().create_var(
-        name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
     if not get_vars:
         get_vars = []
         for s in send_vars:
             v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True)
             get_vars.append(v)
+    rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 
     helper.append_op(
         type="send",
         inputs={"X": send_vars},
-        outputs={"Out": get_vars,
-                 "RPCClient": rpc_client_var},
-        attrs={"endpoints": endpoints,
-               "epmap": epmap})
+        outputs={"Out": get_vars},
+        attrs={
+            "endpoints": endpoints,
+            "epmap": epmap,
+            rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
+        })
+
     return get_vars
 
 
@@ -321,7 +326,7 @@ def open_recordio_file(filename,
                                           dtypes=['float32', 'int64'])
 
          # Via the reader, we can use 'read_file' layer to get data:
-         image, label = fluid.layers.read_file(reader)
+         image, label = fluid.layers.io.read_file(reader)
     """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
@@ -359,11 +364,78 @@ def open_recordio_file(filename,
     return monkey_patch_reader_methods(main_prog_var)
 
 
+def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
+    """
+    Create a uniform random data generator
+
+    This layer returns a Reader Variable.
+    Instead of opening a file and reading data from it, this 
+    Reader Variable generates float uniform random data by itself. 
+    It can be used as a dummy reader to test a network without 
+    opening a real file.
+
+    Args:
+       low(float): The lower bound of data's uniform distribution.
+       high(float): The upper bound of data's uniform distribution.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable from which we can get random data.
+
+    Examples:
+       .. code-block:: python
+
+         reader = fluid.layers.io.random_data_generator(
+                                          low=0.0,
+                                          high=1.0,
+                                          shapes=[(3,224,224), (1)],
+                                          lod_levels=[0, 0])
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.io.read_file(reader)
+    """
+    dtypes = [core.VarDesc.VarType.FP32] * len(shapes)
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    var_name = unique_name('random_data_generator')
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='create_random_data_generator',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'low': low,
+            'high': high,
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    if for_parallel:
+        main_prog_var = parallel(reader=main_prog_var)
+
+    return monkey_patch_reader_methods(main_prog_var)
+
+
 def open_files(filenames,
                shapes,
                lod_levels,
                dtypes,
-               thread_num,
+               thread_num=1,
                buffer_size=None,
                pass_num=1,
                for_parallel=True):
@@ -468,8 +540,6 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
         inputs={'UnderlyingReader': reader},
         outputs={'Out': [new_reader]},
         attrs=attrs)
-    new_reader.persistable = True
-    new_reader.stop_gradient = True
     return monkey_patch_reader_methods(new_reader)
 
 
@@ -514,3 +584,108 @@ def read_file(file_obj):
         return out[0]
     else:
         return out
+
+
+class Preprocessor(object):
+    BEFORE_SUB_BLOCK = 0
+    IN_SUB_BLOCK = 1
+    AFTER_SUB_BLOCK = 2
+
+    def __init__(self, reader, name=None):
+        self.underlying_reader = reader
+        new_reader_name = name if name is not None else unique_name(
+            "create_custom_reader")
+        self.main_prog = default_main_program()
+        self.reader = self.main_prog.current_block().create_var(
+            name=new_reader_name)
+        self.sub_block = None
+        self.source_var_names = None
+        self.sink_var_names = None
+        self.status = Preprocessor.BEFORE_SUB_BLOCK
+
+    def is_completed(self):
+        return self.sub_block and self.source_var_names and self.sink_var_names
+
+    @contextlib.contextmanager
+    def block(self):
+        self.status = Preprocessor.IN_SUB_BLOCK
+        self.sub_block = self.main_prog.create_block()
+        yield
+        self.main_prog.rollback()
+        self.status = Preprocessor.AFTER_SUB_BLOCK
+        if not self.is_completed():
+            raise RuntimeError(
+                "The definition of preprocessor is incompleted! "
+                "Please make sure that you have set input and output "
+                "variables by invoking 'inputs' and 'outputs' in "
+                "Preprocessor's sub-block.")
+
+    def inputs(self):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.inputs() can only be invoked inside the sub-block."
+            )
+
+        source_shapes = self.underlying_reader.desc.shapes()
+        source_dtypes = self.underlying_reader.desc.dtypes()
+        source_lod_levels = self.underlying_reader.desc.lod_levels()
+        self.source_var_names = [
+            unique_name("preprocessor_source")
+            for _ in xrange(len(source_shapes))
+        ]
+        source_vars = []
+        for var_name, shape, dtype, lod_level in zip(
+                self.source_var_names, source_shapes, source_dtypes,
+                source_lod_levels):
+            source_vars.append(self.main_prog.current_block().create_var(
+                name=var_name, shape=shape, dtype=dtype, lod_level=lod_level))
+        return source_vars
+
+    def outputs(self, *outs):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.outputs() can only be invoked inside the sub-block."
+            )
+        self.sink_var_names = [var.name for var in outs]
+
+    def __call__(self, *args, **kwargs):
+        if self.status != Preprocessor.AFTER_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor output can only be retrieved after rnn block.")
+
+        self.main_prog.current_block().append_op(
+            type="create_custom_reader",
+            inputs={'UnderlyingReader': self.underlying_reader},
+            outputs={'Out': [self.reader]},
+            attrs={
+                "sub_block": self.sub_block,
+                "source_var_names": self.source_var_names,
+                "sink_var_names": self.sink_var_names
+            })
+        return monkey_patch_reader_methods(self.reader)
+
+
+@templatedoc()
+def load(out, file_path, load_as_fp16=None):
+    """
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32')
+    >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin")
+
+    Args:
+        out(${out_type}): ${out_comment}.
+
+        file_path(${file_path_type}): ${file_path_comment}.
+
+        load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}.
+
+    Returns:
+        None
+    """
+    helper = LayerHelper("load", **locals())
+    attrs = {"file_path": file_path}
+    if load_as_fp16 is not None:
+        attrs['load_as_fp16'] = load_as_fp16
+    helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs)
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 295d1b7190..cb60a3aec9 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -15,16 +15,13 @@ import re
 import cStringIO
 import functools
 import warnings
+import string
 
 from ..proto import framework_pb2
 from ..framework import OpProtoHolder, Variable
 from ..layer_helper import LayerHelper
 
-__all__ = [
-    'deprecated',
-    'generate_layer_fn',
-    'autodoc',
-]
+__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc']
 
 
 def _convert_(name):
@@ -43,6 +40,10 @@ def _convert_(name):
     return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
 
 
+def _type_to_str_(tp):
+    return framework_pb2.AttrType.Name(tp)
+
+
 def _generate_doc_string_(op_proto):
     """
     Generate docstring by OpProto
@@ -54,9 +55,6 @@ def _generate_doc_string_(op_proto):
         str: the document string
     """
 
-    def _type_to_str_(tp):
-        return framework_pb2.AttrType.Name(tp)
-
     if not isinstance(op_proto, framework_pb2.OpProto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
@@ -75,7 +73,11 @@ def _generate_doc_string_(op_proto):
         buf.write(str(each_input.dispensable))
         buf.write('\n')
 
+    skip_attrs = OpProtoHolder.generated_op_attr_names()
+
     for each_attr in op_proto.attrs:
+        if each_attr.name in skip_attrs:
+            continue
         buf.write('    ')
         buf.write(each_attr.name)
         buf.write(' (')
@@ -220,3 +222,67 @@ def autodoc(comment=""):
         return func
 
     return __impl__
+
+
+_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$")
+
+
+def templatedoc(op_type=None):
+    """
+    Decorator of layer function. It will use the docstring from the layer
+    function as the template. The template arguments are:
+
+    * ${comment}: The operator comment written in CPP.
+    * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput,
+        and AddInput. The ${name} is Python snake style. i.e., xxx_xxx.
+    * ${{name}_type}: The type of ${name}.
+
+    Returns:
+        Decorated function.
+    """
+
+    def trim_ending_dot(msg):
+        return msg.rstrip('.')
+
+    def escape_inline_math(msg):
+        return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg)
+
+    def __impl__(func):
+        if op_type is None:
+            op_type_name = func.__name__
+        else:
+            op_type_name = op_type
+        op_proto = OpProtoHolder.instance().get_op_proto(op_type_name)
+        tmpl = string.Template(func.__doc__)
+
+        comment_lines = op_proto.comment.split("\n")
+        comment = ""
+        for line in comment_lines:
+            line = line.strip()
+            if len(line) != 0:
+                comment += escape_inline_math(line)
+                comment += " "
+            elif len(comment) != 0:
+                comment += "\n    \n    "
+
+        args = {"comment": trim_ending_dot(comment)}
+        for each_input in op_proto.inputs:
+            input_name = _convert_(each_input.name)
+            args["{0}_comment".format(input_name)] = trim_ending_dot(
+                each_input.comment)
+            args["{0}_type".format(input_name)] = "Variable"
+        for each_attr in op_proto.attrs:
+            input_name = _convert_(each_attr.name)
+            args["{0}_comment".format(input_name)] = trim_ending_dot(
+                each_attr.comment)
+            args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)
+
+        for each_opt in op_proto.outputs:
+            output_name = _convert_(each_opt.name)
+            args["{0}_comment".format(output_name)] = trim_ending_dot(
+                each_opt.comment)
+            args["{0}_type".format(output_name)] = "Variable"
+        func.__doc__ = tmpl.substitute(args)
+        return func
+
+    return __impl__
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index d13c54daa5..716cc7824e 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -11,6 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+When training a model, it's often useful to decay the
+learning rate during training process, this is called
+learning_rate_decay. There are many strategies to do
+this, this module will provide some classical method.
+User can also implement their own learning_rate_decay
+strategy according to this module.
+"""
 
 import control_flow
 import nn
@@ -22,14 +30,6 @@ __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
     'polynomial_decay', 'piecewise_decay', 'noam_decay'
 ]
-"""
-When training a model, it's often useful to decay the
-learning rate during training process, this is called
-learning_rate_decay. There are many strategies to do
-this, this module will provide some classical method.
-User can also implement their own learning_rate_decay
-strategy according to this module.
-"""
 
 
 def _decay_step_counter(begin=0):
@@ -41,18 +41,20 @@ def _decay_step_counter(begin=0):
 
 
 def noam_decay(d_model, warmup_steps):
-    """Apply decay to learning rate.
-    ```python
-    lr_value = np.power(d_model, -0.5) * np.min([
-            np.power(current_steps, -0.5),
-            np.power(warmup_steps, -1.5) * current_steps
-        ])
-    ```
+    """
+    Noam decay method. The numpy implementation of noam decay as follows.
+
+    >>> import numpy as np
+    >>> lr_value = np.power(d_model, -0.5) * np.min([
+    >>>                         np.power(current_steps, -0.5),
+    >>>                         np.power(warmup_steps, -1.5) * current_steps])
+
+    Please reference `attention is all you need
+    <https://arxiv.org/pdf/1706.03762.pdf>`_.
 
     Args:
         d_model(Variable): The dimensionality of input and output of model.
-            Reference: attention is all you need
-                https://arxiv.org/pdf/1706.03762.pdf
+
         warmup_steps(Variable): A super parameter.
 
     Returns:
diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index cab2eb5551..a1c64ce277 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200):
     topk_indices = helper.create_tmp_variable(dtype="int64")
     topk_out, topk_indices = nn.topk(input, k=k)
     auc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="accuracy",
         inputs={
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ddf502f08a..7a4dc3d7be 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-All layers just related to the neural network.
+All layers just related to the neural network. 
 """
 
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import concat
 import utils
+import random
 
 __all__ = [
     'fc',
@@ -80,6 +81,12 @@ __all__ = [
     'pad',
     'label_smooth',
     'roi_pool',
+    'dice_loss',
+    'image_resize',
+    'image_resize_short',
+    'resize_bilinear',
+    'gather',
+    'random_crop',
 ]
 
 
@@ -88,7 +95,6 @@ def fc(input,
        num_flatten_dims=1,
        param_attr=None,
        bias_attr=None,
-       use_cudnn=False,
        use_mkldnn=False,
        act=None,
        is_test=False,
@@ -152,7 +158,8 @@ def fc(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          data = fluid.layers.data(
+              name="data", shape=[32, 32], dtype="float32")
           fc = fluid.layers.fc(input=data, size=1000, act="tanh")
     """
 
@@ -175,11 +182,8 @@ def fc(input,
             inputs={"X": input_var,
                     "Y": w},
             outputs={"Out": tmp},
-            attrs={
-                "x_num_col_dims": num_flatten_dims,
-                "y_num_col_dims": 1,
-                "use_mkldnn": use_mkldnn
-            })
+            attrs={"x_num_col_dims": num_flatten_dims,
+                   "y_num_col_dims": 1})
         mul_results.append(tmp)
 
     if len(mul_results) == 1:
@@ -217,6 +221,7 @@ def embedding(input,
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
         is_sparse(bool): The flag indicating whether to use sparse update.
+        is_distributed (bool): Whether to run lookup table from remote parameter server.
         padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
             Otherwise the given :attr:`padding_idx` indicates padding the output
             with zeros whenever lookup encounters it in :attr:`input`. If
@@ -347,7 +352,8 @@ def dynamic_lstm(input,
         cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                               "tanh", "relu", "identity"], default "tanh".
         candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              Choices = ["sigmoid", "tanh",
+                                  "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
@@ -514,10 +520,12 @@ def dynamic_lstmp(input,
         cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                               "tanh", "relu", "identity"], default "tanh".
         candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              Choices = ["sigmoid", "tanh",
+                                  "relu", "identity"],
                               default "tanh".
         proj_activation(str): The activation for projection output.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              Choices = ["sigmoid", "tanh",
+                                  "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
@@ -646,8 +654,9 @@ def dynamic_gru(input,
             :attr:`False`.
         gate_activation(str): The activation for update gate and reset gate.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
-        activation(str): The activation for candidate hidden state.
+        candidate_activation(str): The activation for candidate hidden state.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
+        h_0 (Variable): The hidden output of the first time step.
 
     Returns:
         Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
@@ -699,8 +708,8 @@ def dynamic_gru(input,
 def gru_unit(input,
              hidden,
              size,
-             weight=None,
-             bias=None,
+             param_attr=None,
+             bias_attr=None,
              activation='tanh',
              gate_activation='sigmoid'):
     """
@@ -731,8 +740,8 @@ def gru_unit(input,
         input (Variable): The fc transformed input value of current step.
         hidden (Variable): The hidden value of lstm unit from previous step.
         size (integer): The input dimension value.
-        weight (ParamAttr): The weight parameters for gru unit. Default: None
-        bias (ParamAttr): The bias parameters for gru unit. Default: None
+        param_attr (ParamAttr): The weight parameters for gru unit. Default: None
+        bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
         activation (string): The activation type for cell (actNode).
                              Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
@@ -764,40 +773,52 @@ def gru_unit(input,
     size = size / 3
 
     # create weight
-    if weight is None:
-        weight = helper.create_parameter(
-            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
 
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
     # create bias
-
-    if bias is None:
+    if helper.bias_attr:
         bias_size = [1, 3 * size]
         bias = helper.create_parameter(
             attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    gate = helper.create_tmp_variable(dtype)
-    reset_hidden_pre = helper.create_tmp_variable(dtype)
-    updated_hidden = helper.create_tmp_variable(dtype)
+        inputs['Bias'] = bias
 
     helper.append_op(
         type='gru_unit',
-        inputs={'Input': input,
-                'HiddenPrev': hidden,
-                'Weight': weight},
+        inputs=inputs,
         outputs={
             'Gate': gate,
             'ResetHiddenPrev': reset_hidden_pre,
             'Hidden': updated_hidden,
         },
         attrs={
-            'activation': 0,
-            'gate_activation': 1,
+            'activation': 2,  # tanh
+            'gate_activation': 1,  # sigmoid
         })
 
     return updated_hidden, reset_hidden_pre, gate
 
 
+@templatedoc()
 def linear_chain_crf(input, label, param_attr=None):
+    """
+    Linear Chain CRF.
+
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+        label(${label_type}): ${label_comment}
+        param_attr(ParamAttr): The attribute of the learnable parameter.
+
+    Returns:
+        ${log_likelihood_comment}
+
+    """
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
     transition = helper.create_parameter(
@@ -823,7 +844,19 @@ def linear_chain_crf(input, label, param_attr=None):
     return log_likelihood
 
 
+@templatedoc()
 def crf_decoding(input, param_attr, label=None):
+    """
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+        param_attr(ParamAttr): The parameter attribute for training.
+        label(${label_type}): ${label_comment}
+
+    Returns:
+        ${viterbi_path_comment}
+    """
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
     viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -841,6 +874,13 @@ def cos_sim(X, Y):
     """
     This function performs the cosine similarity between two tensors
     X and Y and returns that as the output.
+
+    Args:
+        X (Variable): The input X.
+        Y (Variable): The input Y.
+    
+    Returns:
+        Variable: the output of cosine(X, Y).
     """
     helper = LayerHelper('cos_sim', **locals())
     out = helper.create_tmp_variable(dtype=X.dtype)
@@ -856,7 +896,7 @@ def cos_sim(X, Y):
     return out
 
 
-def dropout(x, dropout_prob, is_test=False, seed=None):
+def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     """
     Computes dropout.
 
@@ -867,13 +907,15 @@ def dropout(x, dropout_prob, is_test=False, seed=None):
     unchanged.
 
     Args:
-       x(variable): The input tensor.
-       dropout_prob(float): Probability of setting units to zero.
-       is_test(bool): A flag indicating whether it is in test phrase or not.
-       seed(int): A Python integer used to create random seeds. If this
-                  parameter is set to None, a random seed is used.
-                  NOTE: If an integer seed is given, always the same output
-                  units will be dropped. DO NOT use a fixed seed in training.
+        x (Variable): The input tensor.
+         dropout_prob (float): Probability of setting units to zero.
+        is_test (bool): A flag indicating whether it is in test phrase or not.
+        seed (int): A Python integer used to create random seeds. If this
+                    parameter is set to None, a random seed is used.
+                    NOTE: If an integer seed is given, always the same output
+                    units will be dropped. DO NOT use a fixed seed in training.
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
 
     Returns:
         Variable: A tensor variable.
@@ -995,8 +1037,8 @@ def square_error_cost(input, label):
         * :math:`Out`: Output value, same shape with :math:`X`.
 
     Args:
-       input(Variable): Input tensor, has predictions.
-       label(Variable): Label tensor, has target labels.
+        input (Variable): Input tensor, has predictions.
+        label (Variable): Label tensor, has target labels.
 
     Returns:
         Variable: The tensor variable storing the element-wise squared error \
@@ -1025,6 +1067,7 @@ def square_error_cost(input, label):
     return square_out
 
 
+@templatedoc()
 def chunk_eval(input,
                label,
                chunk_scheme,
@@ -1033,6 +1076,18 @@ def chunk_eval(input,
     """
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.
+
+    Args:
+        input (Variable): prediction output of the network.
+        label (Variable): label of the test data set.
+        chunk_scheme (str): ${chunk_scheme_comment}
+        num_chunk_types (int): ${num_chunk_types_comment}
+        excluded_chunk_types (list): ${excluded_chunk_types_comment}
+    
+    Returns:
+        tuple: tuple containing: (precision, recall, f1_score,
+               num_infer_chunks, num_label_chunks,
+               num_correct_chunks)
     """
     helper = LayerHelper("chunk_eval", **locals())
 
@@ -1065,6 +1120,7 @@ def chunk_eval(input,
             num_correct_chunks)
 
 
+@templatedoc()
 def sequence_conv(input,
                   num_filters,
                   filter_size=3,
@@ -1077,6 +1133,19 @@ def sequence_conv(input,
     This function creates the op for sequence_conv, using the inputs and
     other convolutional configurations for the filters and stride as given
     in the input parameters to the function.
+
+    Args:
+        input (Variable): ${x_comment}
+        num_filters (int): number of filters.
+        filter_size (int): the filter size (H and W).
+        filter_stride (int): stride of the filter.
+        padding (bool): if True, add paddings.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        act (str): the activation type
+    
+    Returns:
+        Variable: output of sequence_conv
     """
 
     # FIXME(dzh) : want to unify the argument of python layer
@@ -1118,7 +1187,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
     return softmax_out
 
 
-def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
+def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     helper = LayerHelper('softmax', **locals())
     dtype = helper.input_dtype()
     softmax_out = helper.create_tmp_variable(dtype)
@@ -1176,48 +1245,49 @@ def conv2d(input,
 
         - Input:
 
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
 
         - Output:
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
 
         .. math::
 
-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
 
     Args:
-       input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of filter. It is as same as the output
-           image channel.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
-       groups(int): The groups number of the Conv2d Layer. According to grouped
-           convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-           the first half of the filters is only connected to the first half
-           of the input channels, while the second half of the filters is only
-           connected to the second half of the input channels. Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
-       act(str): Activation type. Default: None
-       name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+        input (Variable): The input image with [N, C, H, W] format.
+            num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not.
+        act (str): Activation type. Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
     Returns:
         Variable: The tensor variable storing the convolution and \
@@ -1375,7 +1445,7 @@ def sequence_pool(input, pool_type):
 
 def sequence_first_step(input):
     """
-    This funciton get the first step of sequence.
+    This function gets the first step of sequence.
 
     .. code-block:: text
 
@@ -1408,7 +1478,7 @@ def sequence_first_step(input):
 
 def sequence_last_step(input):
     """
-    This funciton get the last step of sequence.
+    This function gets the last step of sequence.
 
     .. code-block:: text
 
@@ -1452,6 +1522,22 @@ def pool2d(input,
     """
     This function adds the operator for pooling in 2 dimensions, using the
     pooling configurations mentioned in input parameters.
+
+    Args:
+        input (Variable): ${input_comment}
+        pool_size (int): ${ksize_comment}
+        pool_type (str): ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+    
+    Returns:
+        Variable: output of pool2d layer.
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
@@ -1509,6 +1595,25 @@ def batch_norm(input,
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
+
+    Args:
+        input (Variable): the input variable.
+        act (str): activation type
+        is_test (bool): whether to run batch_norm as test mode.
+        momentum (float): momentum
+        epsilon (float): epsilon, default 1e-05
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        data_layout (str): data layout, default NCHW
+        in_place (bool): if True, do not create tmp variable
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): The name of this layer. It is optional.
+        moving_mean_name (str): The name of moving mean variable name, optional.
+        moving_variance_name (str): The name of moving variance name, optional.
+        do_model_average_for_mean_and_var (bool):
+
+    Returns:
+        Variable: output of batch_norm layer.
     """
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
@@ -1636,6 +1741,7 @@ def layer_norm(input,
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
             bias :math:`b`.
         act(str): Activation to be applied to the output of layer normalizaiton.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: A tensor variable with the same shape as the input.
@@ -1746,6 +1852,7 @@ def conv2d_transpose(input,
                      padding=0,
                      stride=1,
                      dilation=1,
+                     groups=None,
                      param_attr=None,
                      bias_attr=None,
                      use_cudnn=True,
@@ -1797,40 +1904,46 @@ def conv2d_transpose(input,
            W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
 
     Args:
-       input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of the filter. It is as same as the output
-           image channel.
-       output_size(int|tuple|None): The output image size. If output size is a
-           tuple, it must contain two integers, (image_H, image_W). This
-           parameter only works when filter_size is None.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square. None if use output size to
-           calculate filter_size.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
-       param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
-                              Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
-       act(str): Activation type. Default: None
-       name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
+                               Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act(str): Activation type. Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
     Returns:
-       Variable: The tensor variable storing the convolution transpose result.
+        Variable: The tensor variable storing the convolution transpose result.
 
     Raises:
-       ValueError: If the shapes of input, filter_size, stride, padding and
-                   groups mismatch.
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
 
     Examples:
        .. code-block:: python
@@ -1870,7 +1983,8 @@ def conv2d_transpose(input,
         filter_size = utils.convert_to_list(filter_size, 2,
                                             'conv2d_transpose.filter_size')
 
-    filter_shape = [input_channel, num_filters] + filter_size
+    groups = 1 if groups is None else groups
+    filter_shape = [input_channel, num_filters / groups] + filter_size
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
@@ -1884,6 +1998,7 @@ def conv2d_transpose(input,
             'strides': stride,
             'paddings': padding,
             'dilations': dilation,
+            'groups': groups,
             'use_cudnn': use_cudnn
         })
 
@@ -2194,11 +2309,11 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the sum is performed. If
+        dim (list|int|None): The dimensions along which the sum is performed. If
             :attr:`None`, sum all elements of :attr:`input` and return a
             Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
-            the dimension to reduce is :math:`rank + dim`.
+            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
+            the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2219,15 +2334,25 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
             fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
             fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1, 2], [3, 4]],
+            #      [[5, 6], [7, 8]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
+            fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
+
     """
     helper = LayerHelper('reduce_sum', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_sum',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2240,11 +2365,11 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the mean is computed. If
+        dim (list|int|None): The dimensions along which the mean is computed. If
             :attr:`None`, compute the mean over all elements of :attr:`input`
             and return a Tensor variable with a single element, otherwise
             must be in the range :math:`[-rank(input), rank(input))`. If
-            :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2264,16 +2389,26 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_mean(x)  # [0.4375]
             fluid.layers.reduce_mean(x, dim=0)  # [0.15, 0.25, 0.55, 0.8]
             fluid.layers.reduce_mean(x, dim=-1)  # [0.475, 0.4]
-            fluid.layers.reduce_mean(x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+            fluid.layers.reduce_mean(
+                x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_mean(x, dim=[1, 2]) # [2.5, 6.5]
+            fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0]
     """
     helper = LayerHelper('reduce_mean', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_mean',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2286,11 +2421,11 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the maximum is computed.
+        dim (list|int|None): The dimension along which the maximum is computed.
             If :attr:`None`, compute the maximum over all elements of
             :attr:`input` and return a Tensor variable with a single element,
             otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2311,15 +2446,24 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
             fluid.layers.reduce_max(x, dim=-1)  # [0.9, 0.7]
             fluid.layers.reduce_max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_max(x, dim=[1, 2]) # [4.0, 8.0]
+            fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0]
     """
     helper = LayerHelper('reduce_max', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_max',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2332,11 +2476,11 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the minimum is computed.
+        dim (list|int|None): The dimensions along which the minimum is computed.
             If :attr:`None`, compute the minimum over all elements of
             :attr:`input` and return a Tensor variable with a single element,
             otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2357,15 +2501,24 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
             fluid.layers.reduce_min(x, dim=-1)  # [0.2, 0.1]
             fluid.layers.reduce_min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_min(x, dim=[1, 2]) # [1.0, 5.0]
+            fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0]
     """
     helper = LayerHelper('reduce_min', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_min',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2378,11 +2531,11 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the product is performed. If
+        dim (list|int|None): The dimensions along which the product is performed. If
             :attr:`None`, multipy all elements of :attr:`input` and return a
             Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
-            the dimension to reduce is :math:`rank + dim`.
+            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
+            the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2404,15 +2557,24 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_prod(x, dim=-1)  # [0.027, 0.0084]
             fluid.layers.reduce_prod(x, dim=1,
                                      keep_dim=True)  # [[0.027], [0.0084]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_prod(x, dim=[1, 2]) # [24.0, 1680.0]
+            fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0]
     """
     helper = LayerHelper('reduce_prod', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_prod',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2447,7 +2609,8 @@ def split(input, num_or_sections, dim=-1, name=None):
             x0.shape  # [3, 3, 5]
             x1.shape  # [3, 3, 5]
             x2.shape  # [3, 3, 5]
-            x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1)
+            x0, x1, x2 = fluid.layers.split(
+                x, num_or_sections=[2, 3, 4], dim=1)
             x0.shape  # [3, 2, 5]
             x1.shape  # [3, 3, 5]
             x2.shape  # [3, 4, 5]
@@ -2486,19 +2649,21 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     The l2 normalize layer normalizes `x` along dimension `axis` using an L2
     norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
 
-    output = x / sqrt(max(sum(x**2), epsilon))
+    .. math::
+    y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
 
     For `x` with more dimensions, this layer independently normalizes each 1-D
     slice along dimension `axis`.
 
     Args:
-       x(Variable|list): The input tensor to l2_normalize layer.
-       axis(int): Dimension along which to normalize the input.
-       epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will
-                       be used as the divisor if the l2 norm of `x` is less than
-                       sqrt(epsilon).
-       name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
+        x(Variable|list): The input tensor to l2_normalize layer.
+        axis(int): The axis on which to apply normalization. If `axis < 0`,
+            the dimension to normalization is rank(X) + axis. -1 is the
+            last dimension.
+        epsilon(float): The epsilon value is used to avoid division by zero,
+            the defalut value is 1e-10.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
 
     Returns:
@@ -2515,49 +2680,19 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
 
     if len(x.shape) == 1:
         axis = 0
-
     helper = LayerHelper("l2_normalize", **locals())
 
-    square = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
-
-    reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    norm = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
-        type="reduce_sum",
-        inputs={"X": square},
-        outputs={"Out": reduced_sum},
+        type="norm",
+        inputs={"X": x},
+        outputs={"Out": out,
+                 "Norm": norm},
         attrs={
-            "dim": 1 if axis is None else axis,
-            "keep_dim": True,
-            "reduce_all": False
+            "axis": 1 if axis is None else axis,
+            "epsilon": epsilon,
         })
-
-    # TODO(caoying) A lower bound value epsilon for the norm is needed to
-    # imporve the numeric stability of reciprocal. This requires a maximum_op.
-    rsquare = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare})
-
-    # TODO(caoying) the current elementwise_mul operator does not support a
-    # general broadcast rule which broadcasts input(Y) to have the same
-    # dimension with Input(X) starting from a specified dimension. So this
-    # exanpsion is requred. Once a general broadcast rule is spported, this
-    # expanding canbe removed.
-    rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype)
-    expand_times = [1] * len(x.shape)
-    expand_times[axis] = int(x.shape[axis])
-    helper.append_op(
-        type="expand",
-        inputs={"X": rsquare},
-        outputs={"Out": rsquare_expanded},
-        attrs={"expand_times": expand_times})
-
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="elementwise_mul",
-        inputs={"X": x,
-                "Y": rsquare_expanded},
-        outputs={"Out": out})
     return out
 
 
@@ -2667,7 +2802,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     return out
 
 
-def topk(input, k):
+def topk(input, k, name=None):
     """
     This operator is used to find values and indices of the k largest entries
     for the last dimension.
@@ -2683,6 +2818,8 @@ def topk(input, k):
         input(Variable): The input variable which can be a vector or Tensor with
             higher rank.
         k(int): An integer value to specify the top k largest elements.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         values(Variable): The k largest elements along each last dimensional
@@ -2739,16 +2876,13 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
     the edit distance will be divided by the length of reference string.
 
     Args:
-
         input(Variable): The indices for hypothesis strings.
-
         label(Variable): The indices for reference strings.
-
         normalized(bool): Indicated whether to normalize the edit distance by
                           the length of reference string.
-
         ignored_tokens(list of int): Tokens that should be removed before
                                      calculating edit distance.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: sequence-to-sequence edit distance in shape [batch_size, 1].
@@ -2838,10 +2972,10 @@ def ctc_greedy_decoder(input, blank, name=None):
                          where Lp is the sum of all input sequences' length and
                          num_classes is the true number of classes. (not
                          including the blank label).
-
         blank(int): the blank label index of Connectionist Temporal
                     Classification (CTC) loss, which is in thehalf-opened
                     interval [0, num_classes + 1).
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: CTC greedy decode result. If all the sequences in result were
@@ -2878,23 +3012,23 @@ def warpctc(input, label, blank=0, norm_by_times=False):
     input tensor.
 
     Args:
-       input(Variable): (LodTensor, default: LoDTensor<float>),
-         the unscaled probabilities of variable-length sequences,
-         which is a 2-D Tensor with LoD information.
-         It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
-         sequences' length and num_classes is the true number of classes.
-         (not including the blank label).
-       label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
-         of variable-length sequence, which is a 2-D Tensor with LoD
-         information. It is of the shape [Lg, 1], where Lg is th sum of
-         all labels' length.
-       blank: (int, default: 0), the blank label index of Connectionist
-         Temporal Classification (CTC) loss, which is in the
-         half-opened interval [0, num_classes + 1).
-       norm_by_times: (bool, default: false), whether to normalize
-       the gradients by the number of time-step, which is also the
-       sequence's length. There is no need to normalize the gradients
-       if warpctc layer was follewed by a mean_op.
+        input(Variable): (LodTensor, default: LoDTensor<float>),
+            the unscaled probabilities of variable-length sequences,
+            which is a 2-D Tensor with LoD information.
+            It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
+            sequences' length and num_classes is the true number of classes.
+            (not including the blank label).
+        label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
+            of variable-length sequence, which is a 2-D Tensor with LoD
+            information. It is of the shape [Lg, 1], where Lg is th sum of
+            all labels' length.
+        blank (int): default 0, the blank label index of Connectionist
+            Temporal Classification (CTC) loss, which is in the
+            half-opened interval [0, num_classes + 1).
+        norm_by_times (bool): default false, whether to normalize
+            the gradients by the number of time-step, which is also the
+            sequence's length. There is no need to normalize the gradients
+            if warpctc layer was follewed by a mean_op.
 
     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
@@ -2953,9 +3087,9 @@ def sequence_reshape(input, new_dim):
     no remainder for each sequence.
 
     Args:
-       input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
-                with shape being [N, M] where M for dimension.
-       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+        input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
+            with shape being [N, M] where M for dimension.
+        new_dim (int): New dimension which the input LoDTensor is reshaped to.
 
     Returns:
         Variable: Reshaped LoDTensor according to new dimension.
@@ -2977,7 +3111,10 @@ def sequence_reshape(input, new_dim):
     return out
 
 
-@autodoc()
+# FIXME(wuyi): let docstring_checker.py understand @autodoc.
+# For now, the comments in c++ use types like Tensor, but in python side
+# the type is often "Variable", and arguments may vary.
+@templatedoc(op_type="nce")
 def nce(input,
         label,
         num_total_classes,
@@ -2985,6 +3122,21 @@ def nce(input,
         param_attr=None,
         bias_attr=None,
         num_neg_samples=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): input variable.
+        label (Variable): label.
+        num_total_classes (int):${num_total_classes_comment}
+        sample_weight (int): ${sample_weight_comment}
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        num_neg_samples (int): ${num_neg_samples_comment}
+    
+    Returns:
+        Variable: output of nce layer.
+    """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
     dim = input.shape[1]
@@ -3042,8 +3194,9 @@ def transpose(x, perm, name=None):
     perm[i]-th dimension of `input`.
 
     Args:
-       input (Variable): (Tensor), A Tensor.
-       perm (list): A permutation of the dimensions of `input`.
+        x (Variable): The input Tensor.
+        perm (list): A permutation of the dimensions of `input`.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: A transposed Tensor.
@@ -3276,9 +3429,9 @@ def multiplex(inputs, index):
     row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
 
     Args:
-       inputs (list): A list of variables to gather from. All variables have the
+        inputs (list): A list of variables to gather from. All variables have the
                 same shape and the rank is at least 2.
-       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
+        index (Variable): Tensor<int32>, index variable which is a 2-D tensor
                 with shape [M, 1] where M is the batch size.
 
     Returns:
@@ -3358,7 +3511,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             fc = fluid.layers.fc(input=data, size=100)
-            out = fluid.layers.softmax_with_cross_entropy(logits=fc, label=label)
+            out = fluid.layers.softmax_with_cross_entropy(
+                logits=fc, label=label)
     """
     helper = LayerHelper('softmax_with_cross_entropy', **locals())
     softmax = helper.create_tmp_variable(dtype=logits.dtype)
@@ -3405,7 +3559,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
         .. code-block:: python
 
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[100], dtype='float32')
+            label = fluid.layers.data(
+                name='label', shape=[100], dtype='float32')
             fc = fluid.layers.fc(input=data, size=100)
             out = fluid.layers.smooth_l1(x=fc, y=label)
     """
@@ -3475,7 +3630,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         begin(int): The first value of this counter.
         step(int): The increment step between each execution.
 
-    Returns(Variable): The global run counter.
+    Returns:
+        Variable: The global run counter.
     """
     helper = LayerHelper('global_step_counter')
     if counter_name is None:
@@ -3536,7 +3692,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     the corresponding dimension of x.
 
     Args:
-        input(variable): The input tensor.
+        x(variable): The input tensor.
         shape(list): The new shape. At most one dimension of the new shape can
                      be -1.
         actual_shape(variable): An optional input. If provided, reshape
@@ -3548,8 +3704,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
         inplace(bool): If this flag is set true, a new output tensor is created
                        whose data is copied from input x, otherwise the output
                        shares data with input without copying.
+        name (str): The name of this layer. It is optional.
 
-    Returns(variable): The output tensor.
+    Returns:
+        Variable: The output tensor.
 
     Examples:
         .. code-block:: python
@@ -3727,7 +3885,8 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")
+          data = fluid.layers.data(
+              name="data", shape=[3, 112, 112], dtype="float32")
           lrn = fluid.layers.lrn(input=data)
     """
     helper = LayerHelper('lrn', **locals())
@@ -3911,6 +4070,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
                              (num_rois, channels, pooled_h, pooled_w).
 
     Examples:
+        .. code-block:: python
+
             pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
     """
     helper = LayerHelper('roi_pool', **locals())
@@ -3929,3 +4090,278 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
             "spatial_scale": spatial_scale
         })
     return pool_out
+
+
+def dice_loss(input, label, epsilon=0.00001):
+    """
+    Dice loss for comparing the similarity of two batch of data,
+    usually is used for binary image segmentation i.e. labels are binary.
+    The dice loss can be defined as below equation:
+
+    .. math::
+
+        dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\
+                  &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\
+                  &= \\frac{(union\_area - intersection\_area)}{total\_area}
+
+
+    Args:
+        input (Variable): The predictions with rank>=2. The first dimension is batch size,
+                          and the last dimension is class number.
+        label (Variable): The groud truth with the same rank with input. The first dimension
+                          is batch size, and the last dimension is 1.
+        epsilon (float): The epsilon will be added to the numerator and denominator.
+                         If both input and label are empty, it makes sure dice is 1.
+                         Default: 0.00001
+
+    Returns:
+        dice_loss (Variable): The dice loss with shape [1].
+
+    Examples:
+        .. code-block:: python
+
+            predictions = fluid.layers.softmax(x)
+            loss = fluid.layers.dice_loss(input=predictions, label=label, 2)
+    """
+    label = one_hot(label, depth=input.shape[-1])
+    reduce_dim = range(1, len(input.shape))
+    inse = reduce_sum(input * label, dim=reduce_dim)
+    dice_denominator = reduce_sum(
+        input, dim=reduce_dim) + reduce_sum(
+            label, dim=reduce_dim)
+    dice_score = 1 - inse * 2 / (dice_denominator + epsilon)
+    return reduce_mean(dice_score)
+
+
+def image_resize(input,
+                 out_shape=None,
+                 scale=None,
+                 name=None,
+                 resample='BILINEAR'):
+    """
+    Resize a batch of images.
+
+    The input must be a tensor of the shape (num_batches, channels, in_h, in_w), 
+    and the resizing only applies on the last two dimensions(hight and width).
+
+    Supporting resample methods:
+        'BILINEAR' : Bilinear interpolation
+
+    Args:
+        input (Variable): The input tensor of image resize layer,
+                          This is a 4-D tensor of the shape
+                          (num_batches, channels, in_h, in_w).
+        out_shape(list|tuple|Variable|None): Output shape of image resize
+                                    layer, the shape is (out_h, out_w).
+                                    Default: None
+        scale(float|None): The multiplier for the input height or width.
+                         At least one of out_shape or scale must be set.
+                         And out_shape has a higher priority than scale.
+                         Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+        resample(str): The resample method. It can only be 'BILINEAR' currently.
+                       Default: 'BILINEAR'
+
+    Returns:
+        out (Variable): The output is a 4-D tensor of the shape
+                        (num_batches, channls, out_h, out_w).
+
+    Examples:
+        .. code-block:: python
+
+            out = fluid.layers.image_resize(input, out_shape=[12, 12])
+    """
+    resample_methods = {'BILINEAR': 'bilinear_interp'}
+    if resample not in resample_methods:
+        raise ValueError(
+            "The 'resample' of image_resize can only be 'BILINEAR' currently.")
+    if out_shape is None and scale is None:
+        raise ValueError("One of out_shape and scale must not be None")
+    helper = LayerHelper('bilinear_interp', **locals())
+    dtype = helper.input_dtype()
+
+    def _is_list_or_turple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    out_h = 0
+    out_w = 0
+    inputs = {"X": input}
+    if out_shape is not None:
+        if not (_is_list_or_turple_(out_shape) and
+                len(out_shape) == 2) and not isinstance(out_shape, Variable):
+            raise ValueError('out_shape should be a list or tuple or variable')
+        if _is_list_or_turple_(out_shape):
+            out_shape = list(map(int, out_shape))
+            out_h = out_shape[0]
+            out_w = out_shape[1]
+        else:
+            inputs['OutSize'] = out_shape
+    else:
+        out_h = int(input.shape[2] * scale)
+        out_w = int(input.shape[3] * scale)
+
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type=resample_methods[resample],
+        inputs=inputs,
+        outputs={"Out": out},
+        attrs={"out_h": out_h,
+               "out_w": out_w})
+    return out
+
+
+@templatedoc(op_type="bilinear_interp")
+def resize_bilinear(input, out_shape=None, scale=None, name=None):
+    """
+    ${comment}
+
+    Args:
+        input(${x_type}): ${x_comment}.
+
+        out_shape(${out_size_type}): ${out_size_comment}.
+
+        scale(float|None): The multiplier for the input height or width. At
+             least one of out_shape or scale must be set. And out_shape has
+             a higher priority than scale. Default: None.
+
+        name(str|None): The output variable name.
+
+    Returns:
+        ${out_comment}.
+    """
+
+    return image_resize(input, out_shape, scale, name, 'BILINEAR')
+
+
+def image_resize_short(input, out_short_len, resample='BILINEAR'):
+    """
+    Resize a batch of images. The short edge of input images will be 
+    resized to the given 'out_short_len'. The long edge of input images 
+    will be resized proportionately to make images' length-width ratio 
+    constant.
+
+    Args:
+        input (Variable): The input tensor of image resize layer,
+                          This is a 4-D tensor of the shape
+                          (num_batches, channels, in_h, in_w).
+        out_short_len(int): The length of output images' short edge.
+        resample (str): resample method, default: BILINEAR.
+
+    Returns:
+        out (Variable): The output is a 4-D tensor of the shape
+                        (num_batches, channls, out_h, out_w).
+    """
+    in_shape = input.shape
+    if len(in_shape) != 4:
+        raise ValueError(
+            "The rank of input must be 4 (num_batches, channels, in_h, in_w).")
+    hw = in_shape[2:4]
+    short_idx = hw.index(min(hw))
+    long_idx = 1 - short_idx
+    out_shape = list(hw)
+    out_shape[short_idx] = out_short_len
+    out_shape[long_idx] = int(
+        float(out_shape[long_idx]) * (float(out_short_len) / float(hw[
+            short_idx])) + 0.5)
+    return image_resize(input=input, out_shape=out_shape, resample=resample)
+
+
+def gather(input, index):
+    """
+    Output is obtained by gathering entries of the outer-most dimension 
+    of X indexed by `index` and concatenate them together.
+
+    .. math::
+
+        Out = X[Index]
+
+
+    .. code-block:: text
+
+
+                Given:
+
+                X = [[1, 2],
+                     [3, 4],
+                     [5, 6]]
+
+                Index = [1, 2]
+
+                Then:
+
+                Out = [[3, 4],
+                       [5, 6]]
+
+    Args:
+        input (Variable): The source input with rank>=1. 
+        index (Variable): The index input with rank=1.
+
+    Returns:
+        output (Variable): The output is a tensor with the same rank as input.
+
+    Examples:
+        .. code-block:: python
+
+            output = fluid.layers.gather(x, index)
+    """
+    helper = LayerHelper('gather', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="gather",
+        inputs={"X": input,
+                "Index": index},
+        outputs={"Out": out})
+    return out
+
+
+@templatedoc()
+def random_crop(x, shape, seed=None):
+    """
+    ${comment}
+
+    Examples:
+        >>> img = fluid.layers.data("img", [3, 256, 256])
+        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
+
+    Args:
+        x(${x_type}): ${x_comment}
+        shape(${shape_type}): ${shape_comment}
+        seed(int|${seed_type}|None): ${seed_comment} By default, the seed will
+            get from `random.randint(-65536, 65535)`.
+
+    Returns:
+        ${out_comment}
+
+    """
+    helper = LayerHelper("random_crop", **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    if seed is None:
+        seed = random.randint(-65536, 65535)
+
+    if isinstance(seed, int):
+        seed_value = seed
+        seed = helper.create_tmp_variable(dtype="int64")
+        helper.append_op(
+            type="fill_constant",
+            inputs={},
+            outputs={"Out": seed},
+            attrs={
+                "dtype": seed.dtype,
+                "shape": [1],
+                "value": float(seed_value),
+                "force_cpu": True
+            })
+    elif not isinstance(seed, Variable):
+        raise ValueError("'seed' must be a Variable or an int.")
+    seed_out = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="random_crop",
+        inputs={"X": input,
+                "Seed": seed},
+        outputs={"Out": out,
+                 "SeedOut": seed_out},
+        attrs={"shape": shape})
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index a9fe25744c..98f169e8f0 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -71,6 +71,10 @@ __all__ = [
     'cumsum',
     'scatter',
     'sum',
+    'slice',
+    'polygon_box_transform',
+    'shape',
+    'maxout',
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 4be0dc6a6b..62b01d595a 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
 from ..initializer import Constant, force_init_on_cpu
 from ..core import VarDesc
+from layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
@@ -30,6 +31,8 @@ __all__ = [
     'assign',
     'fill_constant_batch_size_like',
     'fill_constant',
+    'argmin',
+    'argmax',
     'ones',
     'zeros',
 ]
@@ -112,7 +115,7 @@ def cast(x, dtype):
     return out
 
 
-def concat(input, axis=0):
+def concat(input, axis=0, name=None):
     """
     **Concat**
 
@@ -122,6 +125,8 @@ def concat(input, axis=0):
     Args:
         input(list): List of tensors to be concatenated
         axis(int): Integer axis along which the tensors will be concatenated
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: Output variable of the concatenation
@@ -264,6 +269,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
     return out
 
 
+@templatedoc()
 def fill_constant_batch_size_like(input,
                                   shape,
                                   dtype,
@@ -271,30 +277,28 @@ def fill_constant_batch_size_like(input,
                                   input_dim_idx=0,
                                   output_dim_idx=0):
     """
-    **fill_constant_batch_size_like**
-
-    This function creates a tensor of specified *shape*, *dtype* and batch size,
-    and initializes this with a constant supplied in *value*. The batch size is
-    obtained from the `input` tensor.
+    ${comment}
 
     It also sets *stop_gradient* to True.
 
+    >>> data = fluid.layers.fill_constant_batch_size_like(
+    >>>             input=like, shape=[1], value=0, dtype='int64')
+
     Args:
-        input(Variable): Tensor whose dimensions will be used to get batch size
-        shape(tuple|list|None): Shape of output tensor
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
-        value(float): Constant value to initialize the output tensor
-        input_dim_idx(int): Index of input's batch size dimension
-        output_dim_idx(int): Index of output's batch size dimension
+        input(${input_type}): ${input_comment}.
 
-    Returns:
-        Variable: The tensor variable storing the output
+        shape(${shape_type}): ${shape_comment}.
 
-    Examples:
-        .. code-block:: python
+        dtype(${dtype_type}): ${dtype_comment}.
+
+        value(${value_type}): ${value_comment}.
 
-          data = fluid.layers.fill_constant_batch_size_like(
-              input=like, shape=[1], value=0, dtype='int64')
+        input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}.
+
+        output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}.
+
+    Returns:
+        ${out_comment}.
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_tmp_variable(dtype=dtype)
@@ -313,6 +317,68 @@ def fill_constant_batch_size_like(input,
     return out
 
 
+def argmin(x, axis=0):
+    """
+    **argmin**
+
+    This function computes the indices of the min elements 
+    of the input tensor's element along the provided axis.
+
+    Args:
+        x(Variable): The input to compute the indices of
+                     the min elements.
+        axis(int): Axis to compute indices along.
+    
+    Returns:
+        Variable: The tensor variable storing the output
+    
+    Examples:
+        .. code-block:: python
+          
+          out = fluid.layers.argmin(x=in, axis=0)
+          out = fluid.layers.argmin(x=in, axis=-1)  
+    """
+    helper = LayerHelper("arg_min", **locals())
+    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    helper.append_op(
+        type='arg_min',
+        inputs={'X': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def argmax(x, axis=0):
+    """
+    **argmax**
+
+    This function computes the indices of the max elements 
+    of the input tensor's element along the provided axis.
+
+    Args:
+        x(Variable): The input to compute the indices of
+                     the max elements.
+        axis(int): Axis to compute indices along.
+    
+    Returns:
+        Variable: The tensor variable storing the output
+    
+    Examples:
+        .. code-block:: python
+          
+          out = fluid.layers.argmax(x=in, axis=0)
+          out = fluid.layers.argmax(x=in, axis=-1)  
+    """
+    helper = LayerHelper("arg_max", **locals())
+    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    helper.append_op(
+        type='arg_max',
+        inputs={'X': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
 def ones(shape, dtype, force_cpu=False):
     """
     **ones**
@@ -361,6 +427,40 @@ def zeros(shape, dtype, force_cpu=False):
     return fill_constant(value=0.0, **locals())
 
 
+def reverse(x, axis):
+    """
+    **reverse**
+
+    This function reverse the input 'x' along given axises.
+
+    Args:
+        x(Vairbale): the input to be reversed.
+        axis(int|tuple|list): Axis that along which order of elements 
+                    is reversed. If it is a tuple or a list, reversing 
+                    will be apply on each axis in the tuple or list.  
+
+    Returns:
+        Variable: The reversed tensor.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.reverse(x=in, axis=0)
+          # or:
+          out = fluid.layers.reverse(x=in, axis=[0,1])
+    """
+    if isinstance(axis, int):
+        axis = [axis]
+    helper = LayerHelper("reverse", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reverse',
+        inputs={'Input': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
 def save(x, file_path, overwrite=True):
     """
     Saves a variable as a file.
@@ -401,22 +501,6 @@ def save_combine(x, file_path, overwrite=True):
               "overwrite": overwrite})
 
 
-def load(out, file_path):
-    """
-    Loads a variable from a given file.
-
-    Args:
-        out(variable): The variable to be read from the disk file.
-        file_path(str): The path of the disk file.
-    """
-    helper = LayerHelper("load", **locals())
-    helper.append_op(
-        type="load",
-        inputs={},
-        output={"Out": out},
-        args={"file_path": file_path})
-
-
 def load_combine(out, file_path):
     """
     Loads a list of vairables from a single file.
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
new file mode 100644
index 0000000000..9946d0a4ff
--- /dev/null
+++ b/python/paddle/fluid/lod_tensor.py
@@ -0,0 +1,189 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import core
+import numpy as np
+
+__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
+
+
+def _validate_lod(lod, tensor_height=-1):
+    """Check whether the input length-based lod info is valid.
+
+    There are several things to check:
+    1. lod should be a list of lists. Empty list is fine.
+    2. The length of each sublist (a lod level) should be at least one.
+    3. Each element in each lod level should be an integer greater than 0.
+    4. The sum of one lod level should be equal to the length of the next lod level.
+    5. The sum of the last lod level should be equal to the tensor height. 
+       Bypass this check if user does not provide tensor_height as input.
+
+    Args:
+        lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]].
+        tensor_height: the outermost dimension of the tensor with which the input 
+            lod is associated with. 
+
+    Returns:
+        A boolean indicating whether the input lod is valid or not.
+    """
+    assert isinstance(lod, list), "lod should be a list"
+    # Empty lod is fine
+    if len(lod) == 0:
+        return True
+
+    lod_sum = []
+    for level in lod:
+        assert isinstance(level, list), "each item in lod should be a list"
+        # Each level of lod should have at least one length info
+        if len(level) < 1:
+            return False
+        level_sum = 0
+        for lod_len in level:
+            # Each length in a level should be > 0
+            if lod_len <= 0:
+                return False
+            level_sum += lod_len
+        lod_sum.append(level_sum)
+
+    for idx, val in enumerate(lod_sum[:-1]):
+        # Each level's sum should be equal to 
+        # the number of items in the next level
+        if val != len(lod[idx + 1]):
+            return False
+
+    if tensor_height == -1:
+        return True
+    else:
+        # Last level's sum should be equal to the tensor height
+        return lod_sum[-1] == tensor_height
+
+
+def _convert_lod(lod):
+    """Convert a length-based lod to a offset-based lod.
+
+    If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]],
+    then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]].
+
+    Args:
+        lod: a length-based lod info. 
+
+    Returns:
+        A list of lists as the offset-based lod converted to from the input lod.
+    """
+    new_lod = []
+    for level in lod:
+        cur_len = 0
+        new_level = [cur_len]
+        for lod_len in level:
+            cur_len += lod_len
+            new_level.append(cur_len)
+        new_lod.append(new_level)
+    return new_lod
+
+
+def create_lod_tensor(data, lod, place):
+    """Create a lod tensor from a numpy array, a list, or an existing lod tensor.
+
+    Create a lod tensor by doing the following:
+    1. Check that the length-based input lod is valid.
+    2. Convert the length-based lod to a offset-based LoD.
+    3. Copy the data from a numpy array, a list or a existing lod tensor to 
+       CPU or GPU device (based on input place).
+    4. Set the level of detail (LoD) using the offset-based LoD.
+    
+    Use example:
+    Suppose we want LoDTensor to hold data for sequences of word, where each word is
+    represented by an integer. If we want to create a LoDTensor to represent two 
+    sentences, one of 2 words, and one of 3 words. 
+
+    Then 'data' can be a numpy array of integers with shape (5, 1).
+    'lod' will be [[2, 3]], indicating the length(# of words) in each sentence.
+    This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]]
+    inside the function call.
+
+    Please refer to 
+    github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md
+    for more details regarding LoD.
+
+    Args:
+        data: a numpy array or a LoDTensor or a list holding the data to be copied.
+        lod: a list of lists indicating the length-based LoD info specified by the user. 
+        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
+
+    Returns:
+        A fluid LoDTensor object with tensor data and lod info.
+    """
+    if isinstance(data, core.LoDTensor):
+        return create_lod_tensor(np.array(data), lod, place)
+    elif isinstance(data, list):
+        # When input data is a list, it only deal with the case where the base element 
+        # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated 
+        # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number 
+        # of words or other indexes in the sequence. 
+        new_lod = []
+        for seq in data:
+            new_lod.append(len(seq))
+        assert [new_lod] == lod, "data and lod do not match"
+        flattened_data = np.concatenate(data, axis=0).astype("int64")
+        flattened_data = flattened_data.reshape([len(flattened_data), 1])
+        return create_lod_tensor(flattened_data, lod, place)
+    elif isinstance(data, np.ndarray):
+        assert _validate_lod(lod,
+                             data.shape[0]), "the provided lod info is invalid"
+        tensor = core.LoDTensor()
+        tensor.set(data, place)
+        tensor.set_lod(_convert_lod(lod))
+        return tensor
+    else:
+        raise TypeError(
+            "data should be either a LoDTensor, a Numpy array or a list")
+
+
+def create_random_int_lodtensor(lod, base_shape, place, low, high):
+    """Create a LoDTensor containing random integers.
+
+    This function is frequently used in the book examples. So we revised it based on 
+    the new create_lod_tensor API and put it here in the lod_tensor module to simplify 
+    the code. 
+
+    The function does the following:
+    1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input 
+    and the shape of the basic element in 'base_shape'.
+    2. Create a numpy array of this shape.
+    3. Create the LoDTensor using create_lod_tensor API.
+
+    Suppose we want LoDTensor to hold data for sequences of word, where each word is
+    represented by an integer. If we want to create a LoDTensor to represent two 
+    sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input 
+    length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be 
+    [5, 1], holding 5 words for two sentences. 
+
+    Args:
+        data: a numpy array or a LoDTensor holding the data to be copied.
+        lod: a list of lists indicating the length-based LoD info specified by the user.
+        base_shape: the shape of the basic element to be held by the LoDTensor. 
+        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
+        low: the lower bound of the random integers.
+        high: the upper bound of the random integers.
+
+    Returns:
+        A fluid LoDTensor object with tensor data and lod info. 
+    """
+    assert isinstance(base_shape, list), "base_shape should be a list"
+    converted_lod = _convert_lod(lod)
+    # append the total number of basic elements to the front of its shape
+    overall_shape = [converted_lod[-1][-1]] + base_shape
+    # the range of integer data elements is [low, high]    
+    data = np.random.random_integers(low, high, overall_shape).astype("int64")
+    return create_lod_tensor(data, lod, place)
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 0fc4805522..115362c6bf 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -28,8 +28,8 @@ from contextlib import contextmanager
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
-    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage',
-    'Optimizer'
+    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
+    'Adadelta', 'ModelAverage', 'Optimizer'
 ]
 
 
@@ -213,11 +213,13 @@ class Optimizer(object):
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
-                if param_and_grad[0].trainable is True and param_and_grad[
-                        1] is not None:
-                    optimize_op = self._append_optimize_op(loss.block,
-                                                           param_and_grad)
-                    optimize_ops.append(optimize_op)
+                with param_and_grad[0].block.program.optimized_guard(
+                        param_and_grad[0]):
+                    if param_and_grad[0].trainable is True and param_and_grad[
+                            1] is not None:
+                        optimize_op = self._append_optimize_op(loss.block,
+                                                               param_and_grad)
+                        optimize_ops.append(optimize_op)
 
             # Get custom finish ops for subclasses
             # FIXME: Need to fix this once we figure out how to handle dependencies
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 3117dfe00c..0fdc9a0352 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -18,6 +18,7 @@ import framework
 import executor
 import warnings
 import sys
+import os
 
 __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
 
@@ -101,7 +102,9 @@ class ParallelExecutor(object):
                 p.set_place(self._act_places[-1])
                 self._places.append(p)
         else:
-            for i in xrange(multiprocessing.cpu_count()):
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            for i in xrange(cpu_num):
                 p = core.Place()
                 self._act_places.append(core.CPUPlace())
                 p.set_place(self._act_places[-1])
@@ -110,19 +113,17 @@ class ParallelExecutor(object):
 
         if exec_strategy is None:
             exec_strategy = ExecutionStrategy()
-            if use_cuda:
-                exec_strategy.use_event = True
-            else:
-                exec_strategy.use_event = False
+        exec_strategy.use_cuda = use_cuda
 
         if exec_strategy.num_threads == 0:
             if use_cuda:
                 # Experiments on se-resnext shows that too many threads hurt
                 # performance. Worth tunning for other models in the future.
-                exec_strategy.num_threads = len(self._places) * 2
+                exec_strategy.num_threads = len(self._places) * 4
             else:
-                exec_strategy.num_threads = min(
-                    len(self._places) * 2, multiprocessing.cpu_count())
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                exec_strategy.num_threads = cpu_num
 
         if build_strategy is None:
             build_strategy = BuildStrategy()
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 04fd05cc33..e2bd1d4c9a 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -16,7 +16,10 @@ import core
 from contextlib import contextmanager
 import os
 
-__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
+__all__ = [
+    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
+    'stop_profiler'
+]
 
 NVPROF_CONFIG = [
     "gpustarttimestamp",
@@ -72,20 +75,31 @@ def reset_profiler():
     core.reset_profiler()
 
 
-@contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By defalut, it records the CPU and GPU operator kernels,
-    if you want to profile other program, you can refer the profiling tutorial
-    to add more records.
+def start_profiler(state):
+    """Enable the profiler.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
+            GPU as well. 'All' also generates timeline.
+    """
+    if core.is_profiler_enabled():
+        return
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
+    core.enable_profiler(prof_state)
+
+
+def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
+    """Stop the profiler.
 
     Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
-            would not inherit this place.
         sorted_key (string) : If None, the profiling results will be printed
             in the order of first end time of events. Otherwise, the profiling
             results will be sorted by the this flag. This flag should be one
@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
     """
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
-    yield
-
+    if not core.is_profiler_enabled():
+        return
     sorted_key = 'default' if sorted_key is None else sorted_key
     if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
         raise ValueError("The sorted_key must be None or in 'calls', 'total', "
@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     # TODO(qingqing) : redirect C++ ostream to Python stream.
     # with core.ostream_redirect(stdout=True, stderr=True):
     core.disable_profiler(key_map[sorted_key], profile_path)
+
+
+@contextmanager
+def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    if you want to profile other program, you can refer the profiling tutorial
+    to add more records.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by the this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+        profile_path (string) : If state == 'All', it will write a profile
+            proto output file.
+    """
+    start_profiler(state)
+    yield
+    stop_profiler(sorted_key, profile_path)
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 5accaacd53..8d48e9abef 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import core
 import contextlib
-
-__all__ = ['convert_reader_to_recordio_file']
+__all__ = [
+    'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
+]
 
 
 @contextlib.contextmanager
@@ -46,3 +48,36 @@ def convert_reader_to_recordio_file(
             writer.complete_append_tensor()
             counter += 1
     return counter
+
+
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with create_recordio_writer(filename, compressor,
+                                        max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+    return counter
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index c006bd9a66..c4d6829599 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -43,31 +43,32 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
     """
     params_and_grads = []
     for param, grad in parameters_and_grads:
-        # If no gradient then we don't need to do anything
-        if grad is None:
+        with param.block.program.optimized_guard(param):
+            # If no gradient then we don't need to do anything
+            if grad is None:
+                params_and_grads.append((param, grad))
+                continue
+
+            regularization_term = None
+            if param.regularizer is not None:
+                # Add variable for regularization term in grad block
+                regularization_term = param.regularizer(param, grad, grad.block)
+            elif regularization is not None:
+                regularization_term = regularization(param, grad, grad.block)
+
+            # If no regularization specified, then we don't need to do anything
+            if regularization_term is None:
+                params_and_grads.append((param, grad))
+                continue
+
+            assert grad.shape == regularization_term.shape
+
+            grad.block.append_op(
+                type='elementwise_add',
+                inputs={"X": grad,
+                        "Y": regularization_term},
+                outputs={"Out": grad})
             params_and_grads.append((param, grad))
-            continue
-
-        regularization_term = None
-        if param.regularizer is not None:
-            # Add variable for regularization term in grad block
-            regularization_term = param.regularizer(param, grad, grad.block)
-        elif regularization is not None:
-            regularization_term = regularization(param, grad, grad.block)
-
-        # If no regularization specified, then we don't need to do anything
-        if regularization_term is None:
-            params_and_grads.append((param, grad))
-            continue
-
-        assert grad.shape == regularization_term.shape
-
-        grad.block.append_op(
-            type='elementwise_add',
-            inputs={"X": grad,
-                    "Y": regularization_term},
-            outputs={"Out": grad})
-        params_and_grads.append((param, grad))
 
     return params_and_grads
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
index da76747f82..efa5ee2d06 100644
--- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
@@ -9,3 +9,8 @@ endforeach()
 add_subdirectory(fit_a_line)
 add_subdirectory(recognize_digits)
 add_subdirectory(image_classification)
+add_subdirectory(understand_sentiment)
+add_subdirectory(label_semantic_roles)
+add_subdirectory(word2vec)
+add_subdirectory(recommender_system)
+add_subdirectory(machine_translation)
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index 4c8505acf3..ad28c9eff5 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -38,7 +38,7 @@ def inference_program():
     return y_predict
 
 
-def linear():
+def train_program():
     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
     y_predict = inference_program()
 
@@ -48,13 +48,15 @@ def linear():
     return avg_loss
 
 
-def train(use_cuda, train_program, save_dirname):
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        place=place,
-        optimizer=fluid.optimizer.SGD(learning_rate=0.001))
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
@@ -68,8 +70,8 @@ def train(use_cuda, train_program, save_dirname):
                 ['15.343549569447836']
                 ...
                 '''
-                if save_dirname is not None:
-                    trainer.save_params(save_dirname)
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
                 trainer.stop()
 
     trainer.train(
@@ -80,19 +82,19 @@ def train(use_cuda, train_program, save_dirname):
 
 
 # infer
-def infer(use_cuda, inference_program, save_dirname=None):
-    if save_dirname is None:
+def infer(use_cuda, inference_program, params_dirname=None):
+    if params_dirname is None:
         return
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=save_dirname, place=place)
+        infer_func=inference_program, param_path=params_dirname, place=place)
 
     batch_size = 10
     tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
 
     results = inferencer.infer({'x': tensor_x})
-    print("infer results: ", numpy.array(results[0]))
+    print("infer results: ", results[0])
 
 
 def main(use_cuda):
@@ -100,10 +102,10 @@ def main(use_cuda):
         return
 
     # Directory for saving the trained model
-    save_dirname = "fit_a_line.inference.model"
+    params_dirname = "fit_a_line.inference.model"
 
-    train(use_cuda, linear, save_dirname)
-    infer(use_cuda, inference_program, save_dirname)
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
 
 
 class TestFitALine(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index 1160e500db..2df3da9cca 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -85,7 +85,11 @@ def train_network():
     return [avg_cost, accuracy]
 
 
-def train(use_cuda, train_program, save_dirname):
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     BATCH_SIZE = 128
     EPOCH_NUM = 1
 
@@ -105,15 +109,13 @@ def train(use_cuda, train_program, save_dirname):
             print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
 
             if accuracy > 0.01:  # Low threshold for speeding up CI
-                if save_dirname is not None:
-                    trainer.save_params(save_dirname)
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
                 return
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     trainer = fluid.Trainer(
-        train_func=train_program,
-        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
-        place=place)
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
 
     trainer.train(
         reader=train_reader,
@@ -122,10 +124,10 @@ def train(use_cuda, train_program, save_dirname):
         feed_order=['pixel', 'label'])
 
 
-def infer(use_cuda, inference_program, save_dirname=None):
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=save_dirname, place=place)
+        infer_func=inference_program, param_path=params_dirname, place=place)
 
     # The input's dimension of conv should be 4-D or 5-D.
     # Use normilized image pixels as input data, which should be in the range
@@ -142,12 +144,14 @@ def main(use_cuda):
     save_path = "image_classification_resnet.inference.model"
 
     train(
-        use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
+        use_cuda=use_cuda,
+        train_program=train_network,
+        params_dirname=save_path)
 
     infer(
         use_cuda=use_cuda,
         inference_program=inference_network,
-        save_dirname=save_path)
+        params_dirname=save_path)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index 1e3e955ba0..224cca417e 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -64,7 +64,11 @@ def train_network():
     return [avg_cost, accuracy]
 
 
-def train(use_cuda, train_program, save_dirname):
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     BATCH_SIZE = 128
     train_reader = paddle.batch(
         paddle.reader.shuffle(
@@ -82,15 +86,13 @@ def train(use_cuda, train_program, save_dirname):
             print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
 
             if accuracy > 0.01:  # Low threshold for speeding up CI
-                if save_dirname is not None:
-                    trainer.save_params(save_dirname)
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
                 return
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     trainer = fluid.Trainer(
-        train_func=train_program,
-        place=place,
-        optimizer=fluid.optimizer.Adam(learning_rate=0.001))
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
     trainer.train(
         reader=train_reader,
@@ -99,10 +101,10 @@ def train(use_cuda, train_program, save_dirname):
         feed_order=['pixel', 'label'])
 
 
-def infer(use_cuda, inference_program, save_dirname=None):
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=save_dirname, place=place)
+        infer_func=inference_program, param_path=params_dirname, place=place)
 
     # The input's dimension of conv should be 4-D or 5-D.
     # Use normilized image pixels as input data, which should be in the range
@@ -119,12 +121,14 @@ def main(use_cuda):
     save_path = "image_classification_vgg.inference.model"
 
     train(
-        use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
+        use_cuda=use_cuda,
+        train_program=train_network,
+        params_dirname=save_path)
 
     infer(
         use_cuda=use_cuda,
         inference_program=inference_network,
-        save_dirname=save_path)
+        params_dirname=save_path)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
new file mode 100644
index 0000000000..673c965b66
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py
deleted file mode 100755
index fe36e55bb5..0000000000
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/no_test_label_semantic_roles.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-import numpy
-
-WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
-WORD_DICT_LEN = len(WORD_DICT)
-LABEL_DICT_LEN = len(LABEL_DICT)
-PRED_DICT_LEN = len(VERB_DICT)
-MARK_DICT_LEN = 2
-
-
-def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark):
-    WORD_DIM = 32
-    MARK_DIM = 5
-    HIDDEN_DIM = 512
-    DEPTH = 8
-    EMBEDDING_NAME = 'emb'
-
-    # Data definitions
-    word = fluid.layers.data(
-        name='word_data', shape=[1], dtype='int64', lod_level=1)
-    predicate = fluid.layers.data(
-        name='verb_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n2 = fluid.layers.data(
-        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n1 = fluid.layers.data(
-        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_0 = fluid.layers.data(
-        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p1 = fluid.layers.data(
-        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p2 = fluid.layers.data(
-        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-    mark = fluid.layers.data(
-        name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-    # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        size=[PRED_DICT_LEN, WORD_DIM],
-        dtype='float32',
-        is_sparse=IS_SPARSE,
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        size=[MARK_DICT_LEN, MARK_DIM],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        fluid.layers.embedding(
-            size=[WORD_DICT_LEN, WORD_DIM],
-            input=x,
-            param_attr=fluid.ParamAttr(
-                name=EMBEDDING_NAME, trainable=False)) for x in word_input
-    ]
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=HIDDEN_DIM, act='tanh')
-        for emb in emb_layers
-    ]
-
-    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=HIDDEN_DIM,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    # stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, DEPTH):
-        mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=HIDDEN_DIM, act='tanh'),
-            fluid.layers.fc(input=input_tmp[1], size=HIDDEN_DIM, act='tanh')
-        ])
-
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=HIDDEN_DIM,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=LABEL_DICT_LEN, act='tanh'),
-        fluid.layers.fc(input=input_tmp[1], size=LABEL_DICT_LEN, act='tanh')
-    ])
-
-    return feature_out
-
-
-def inference_network():
-    predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,
-                       mark)
-
-    crf_decode = fluid.layers.crf_decoding(
-        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
-
-    return crf_decode
-
-
-def train_network():
-    MIX_HIDDEN_LR = 1e-3
-
-    predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,
-                       mark)
-    target = fluid.layers.data(
-        name='target', shape=[1], dtype='int64', lod_level=1)
-    crf_cost = fluid.layers.linear_chain_crf(
-        input=predict,
-        label=target,
-        param_attr=fluid.ParamAttr(
-            name='crfw', learning_rate=MIX_HIDDEN_LR))
-    avg_cost = fluid.layers.mean(crf_cost)
-
-    return avg_cost
-
-
-def train(use_cuda, save_path):
-    BATCH_SIZE = 128
-    EPOCH_NUM = 1
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.conll05.train(), buf_size=8192),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
-
-    def event_handler(event):
-        if isinstance(event, fluid.EndIteration):
-            if (event.batch_id % 10) == 0:
-                avg_cost = trainer.test(reader=test_reader)
-
-                print('BatchID {0:04}, Loss {1:2.2}'.format(event.batch_id + 1,
-                                                            avg_cost))
-
-                if avg_cost > 0.01:  # Low threshold for speeding up CI
-                    trainer.save_params(save_path)
-                    return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    sgd_optimizer = fluid.optimizer.SGD(
-        learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.01,
-            decay_steps=100000,
-            decay_rate=0.5,
-            staircase=True))
-    trainer = fluid.Trainer(train_network, optimizer=sgd_optimizer, place=place)
-    trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
-
-
-def infer(use_cuda, save_path):
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = fluid.Inferencer(
-        inference_program, param_path=save_path, place=place)
-
-    def create_random_lodtensor(lod, place, low, high):
-        data = np.random.random_integers(low, high,
-                                         [lod[-1], 1]).astype("int64")
-        res = fluid.LoDTensor()
-        res.set(data, place)
-        res.set_lod([lod])
-        return res
-
-    # Create an input example
-    lod = [0, 4, 10]
-    word = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
-    pred = create_random_lodtensor(lod, place, low=0, high=PRED_DICT_LEN - 1)
-    ctx_n2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_n1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_0 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_p1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
-    ctx_p2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
-    mark = create_random_lodtensor(lod, place, low=0, high=MARK_DICT_LEN - 1)
-
-    results = inferencer.infer({
-        'word_data': word,
-        'verb_data': pred,
-        'ctx_n2_data': ctx_n2,
-        'ctx_n1_data': ctx_n1,
-        'ctx_0_data': ctx_0,
-        'ctx_p1_data': ctx_p1,
-        'ctx_p2_data': ctx_p2,
-        'mark_data': mark
-    })
-
-    print("infer results: ", results)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    save_path = "label_semantic_roles.inference.model"
-    train(use_cuda, save_path)
-    infer(use_cuda, save_path)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
new file mode 100755
index 0000000000..0ccb3a39e0
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -0,0 +1,265 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+
+WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
+WORD_DICT_LEN = len(WORD_DICT)
+LABEL_DICT_LEN = len(LABEL_DICT)
+PRED_DICT_LEN = len(VERB_DICT)
+MARK_DICT_LEN = 2
+IS_SPARSE = True
+BATCH_SIZE = 10
+EMBEDDING_NAME = 'emb'
+
+
+def lstm_net():
+    WORD_DIM = 32
+    MARK_DIM = 5
+    HIDDEN_DIM = 512
+    DEPTH = 8
+
+    # Data definitions
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[PRED_DICT_LEN, WORD_DIM],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[MARK_DICT_LEN, MARK_DIM],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[WORD_DICT_LEN, WORD_DIM],
+            input=x,
+            param_attr=fluid.ParamAttr(name=EMBEDDING_NAME))
+        for x in word_input
+        #name=EMBEDDING_NAME, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=HIDDEN_DIM, act='tanh')
+        for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=HIDDEN_DIM,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, DEPTH):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=HIDDEN_DIM, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=HIDDEN_DIM, act='tanh')
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=HIDDEN_DIM,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=LABEL_DICT_LEN, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=LABEL_DICT_LEN, act='tanh')
+    ])
+
+    return feature_out
+
+
+def inference_program():
+    predict = lstm_net()
+
+    return predict
+
+
+def train_program():
+    MIX_HIDDEN_LR = 1e-3
+
+    predict = lstm_net()
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=predict,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=MIX_HIDDEN_LR))
+    avg_cost = fluid.layers.mean(crf_cost)
+
+    return [avg_cost]
+
+
+def optimize_func():
+    return fluid.optimizer.SGD(learning_rate=fluid.layers.exponential_decay(
+        learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True))
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimize_func)
+
+    feed_order = [
+        'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
+        'ctx_p2_data', 'verb_data', 'mark_data', 'target'
+    ]
+
+    #embedding_param = fluid.global_scope().find_var(
+    #        EMBEDDING_NAME).get_tensor()
+    #embedding_param.set(
+    #        load_parameter(conll05.get_embedding(), WORD_DICT_LEN, WORD_DIM),
+    #        place)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
+            avg_cost_set = trainer.test(
+                reader=test_reader, feed_order=feed_order)
+
+            # get avg cost
+            avg_cost = np.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+
+            if float(avg_cost) < 100.0:  # Large value to increase CI speed
+                trainer.save_params(params_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                              float(avg_cost)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
+
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=feed_order)
+
+
+def infer(use_cuda, inference_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        inference_program, param_path=params_dirname, place=place)
+
+    # Setup inputs by creating LoDTensors to represent sequences of words.
+    # Here each word is the basic element of these LoDTensors and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensors will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_n2 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_n1 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_0 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_p1 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_p2 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    pred = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
+    mark = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
+
+    results = inferencer.infer(
+        {
+            'word_data': word,
+            'ctx_n2_data': ctx_n2,
+            'ctx_n1_data': ctx_n1,
+            'ctx_0_data': ctx_0,
+            'ctx_p1_data': ctx_p1,
+            'ctx_p2_data': ctx_p2,
+            'verb_data': pred,
+            'mark_data': mark
+        },
+        return_numpy=False)
+
+    print("infer results: ", np.array(results[0]).shape)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "label_semantic_roles.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
new file mode 100644
index 0000000000..673c965b66
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
new file mode 100644
index 0000000000..c4b37df3a0
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -0,0 +1,306 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as pd
+from paddle.fluid.executor import Executor
+from functools import partial
+import unittest
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+hidden_dim = 32
+word_dim = 16
+batch_size = 2
+max_length = 8
+topk_size = 50
+trg_dic_size = 10000
+beam_size = 2
+
+decoder_size = hidden_dim
+
+
+def encoder(is_sparse):
+    # encoder
+    src_word_id = pd.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = pd.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
+
+def train_decoder(context, is_sparse):
+    # decoder
+    trg_language_word = pd.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = pd.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = pd.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        pre_state = rnn.memory(init=context)
+        current_state = pd.fc(input=[current_word, pre_state],
+                              size=decoder_size,
+                              act='tanh')
+
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        rnn.update_memory(pre_state, current_state)
+        rnn.output(current_score)
+
+    return rnn()
+
+
+def decode(context, is_sparse):
+    init_state = context
+    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
+
+    # fill the first element with init_state
+    state_array = pd.create_array('float32')
+    pd.array_write(init_state, array=state_array, i=counter)
+
+    # ids, scores as memory
+    ids_array = pd.create_array('int64')
+    scores_array = pd.create_array('float32')
+
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    pd.array_write(init_ids, array=ids_array, i=counter)
+    pd.array_write(init_scores, array=scores_array, i=counter)
+
+    cond = pd.less_than(x=counter, y=array_len)
+
+    while_op = pd.While(cond=cond)
+    with while_op.block():
+        pre_ids = pd.array_read(array=ids_array, i=counter)
+        pre_state = pd.array_read(array=state_array, i=counter)
+        pre_score = pd.array_read(array=scores_array, i=counter)
+
+        # expand the lod of pre_state to be the same with pre_score
+        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+        pre_ids_emb = pd.embedding(
+            input=pre_ids,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=is_sparse)
+
+        # use rnn unit to update rnn
+        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
+                              size=decoder_size,
+                              act='tanh')
+        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
+        # use score to do beam search
+        current_score = pd.fc(input=current_state_with_lod,
+                              size=target_dict_dim,
+                              act='softmax')
+        topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
+        selected_ids, selected_scores = pd.beam_search(
+            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+
+        pd.increment(x=counter, value=1, in_place=True)
+
+        # update the memories
+        pd.array_write(current_state, array=state_array, i=counter)
+        pd.array_write(selected_ids, array=ids_array, i=counter)
+        pd.array_write(selected_scores, array=scores_array, i=counter)
+
+        pd.less_than(x=counter, y=array_len, cond=cond)
+
+    translation_ids, translation_scores = pd.beam_search_decode(
+        ids=ids_array, scores=scores_array)
+
+    # return init_ids, init_scores
+
+    return translation_ids, translation_scores
+
+
+def train_program(is_sparse):
+    context = encoder(is_sparse)
+    rnn_out = train_decoder(context, is_sparse)
+    label = pd.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(cost)
+    return avg_cost
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(
+        learning_rate=1e-4,
+        regularization=fluid.regularizer.L2DecayRegularizer(
+            regularization_coeff=0.1))
+
+
+def train(use_cuda, is_sparse, is_local=True):
+    EPOCH_NUM = 1
+
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step))
+            if event.step == 10:
+                trainer.stop()
+
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, is_sparse),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=EPOCH_NUM,
+        event_handler=event_handler,
+        feed_order=feed_order)
+
+
+def decode_main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder(is_sparse)
+    translation_ids, translation_scores = decode(context, is_sparse)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [1] * batch_size
+    init_lod = [init_lod, init_lod]
+
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
+    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = ['src_word_id']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    for data in train_data():
+        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict['init_ids'] = init_ids
+        feed_dict['init_scores'] = init_scores
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed=feed_dict,
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print result_ids.lod()
+        break
+
+
+class TestMachineTranslation(unittest.TestCase):
+    pass
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+def inject_test_train(use_cuda, is_sparse):
+    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
+                                         if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            train(use_cuda, is_sparse)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+def inject_test_decode(use_cuda, is_sparse, decorator=None):
+    f_name = 'test_{0}_{1}_decode'.format('cuda'
+                                          if use_cuda else 'cpu', 'sparse'
+                                          if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            decode_main(use_cuda, is_sparse)
+
+    if decorator is not None:
+        f = decorator(f)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+        inject_test_train(_use_cuda_, _is_sparse_)
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+
+        _decorator_ = None
+        if _use_cuda_:
+            _decorator_ = unittest.skip(
+                reason='Beam Search does not support CUDA!')
+
+        inject_test_decode(
+            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index 2128d4c5b8..9a09db25dc 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -57,14 +57,17 @@ def train_program():
     return [avg_cost, acc]
 
 
-def train(use_cuda, train_program, save_dirname):
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 
     trainer = fluid.Trainer(
         train_func=train_program,
         place=place,
-        optimizer=optimizer,
+        optimizer_func=optimizer_func,
         parallel=True)
 
     def event_handler(event):
@@ -78,7 +81,7 @@ def train(use_cuda, train_program, save_dirname):
             print("acc     : %s" % acc)
 
             if acc > 0.2:  # Smaller value to increase CI speed
-                trainer.save_params(save_dirname)
+                trainer.save_params(params_dirname)
             else:
                 print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
                     event.epoch + 1, avg_cost, acc))
@@ -100,11 +103,11 @@ def train(use_cuda, train_program, save_dirname):
         feed_order=['img', 'label'])
 
 
-def infer(use_cuda, inference_program, save_dirname=None):
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=save_dirname, place=place)
+        infer_func=inference_program, param_path=params_dirname, place=place)
 
     batch_size = 1
     tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -112,21 +115,21 @@ def infer(use_cuda, inference_program, save_dirname=None):
 
     results = inferencer.infer({'img': tensor_img})
 
-    print("infer results: ", numpy.array(results[0]))
+    print("infer results: ", results[0])
 
 
 def main(use_cuda):
-    save_dirname = "recognize_digits_conv.inference.model"
+    params_dirname = "recognize_digits_conv.inference.model"
 
     # call train() with is_local argument to run distributed train
     train(
         use_cuda=use_cuda,
         train_program=train_program,
-        save_dirname=save_dirname)
+        params_dirname=params_dirname)
     infer(
         use_cuda=use_cuda,
         inference_program=inference_program,
-        save_dirname=save_dirname)
+        params_dirname=params_dirname)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index 041c8d778e..b2b544e791 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -44,12 +44,15 @@ def train_program():
     return [avg_cost, acc]
 
 
-def train(use_cuda, train_program, save_dirname):
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 
     trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer=optimizer)
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
@@ -62,7 +65,7 @@ def train(use_cuda, train_program, save_dirname):
             print("acc     : %s" % acc)
 
             if acc > 0.2:  # Smaller value to increase CI speed
-                trainer.save_params(save_dirname)
+                trainer.save_params(params_dirname)
             else:
                 print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
                     event.epoch + 1, avg_cost, acc))
@@ -81,11 +84,11 @@ def train(use_cuda, train_program, save_dirname):
         feed_order=['img', 'label'])
 
 
-def infer(use_cuda, inference_program, save_dirname=None):
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=save_dirname, place=place)
+        infer_func=inference_program, param_path=params_dirname, place=place)
 
     batch_size = 1
     tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -93,21 +96,21 @@ def infer(use_cuda, inference_program, save_dirname=None):
 
     results = inferencer.infer({'img': tensor_img})
 
-    print("infer results: ", numpy.array(results[0]))
+    print("infer results: ", results[0])
 
 
 def main(use_cuda):
-    save_dirname = "recognize_digits_mlp.inference.model"
+    params_dirname = "recognize_digits_mlp.inference.model"
 
     # call train() with is_local argument to run distributed train
     train(
         use_cuda=use_cuda,
         train_program=train_program,
-        save_dirname=save_dirname)
+        params_dirname=params_dirname)
     infer(
         use_cuda=use_cuda,
         inference_program=inference_program,
-        save_dirname=save_dirname)
+        params_dirname=params_dirname)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
new file mode 100644
index 0000000000..673c965b66
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
new file mode 100644
index 0000000000..090c11ce1e
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -0,0 +1,258 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import sys
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.nets as nets
+
+IS_SPARSE = True
+USE_GPU = False
+BATCH_SIZE = 256
+
+
+def get_usr_combined_features():
+    # FIXME(dzh) : old API integer_value(10) may have range check.
+    # currently we don't have user configurated check.
+
+    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
+
+    usr_emb = layers.embedding(
+        input=uid,
+        dtype='float32',
+        size=[USR_DICT_SIZE, 32],
+        param_attr='user_table',
+        is_sparse=IS_SPARSE)
+
+    usr_fc = layers.fc(input=usr_emb, size=32)
+
+    USR_GENDER_DICT_SIZE = 2
+
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+
+    usr_gender_emb = layers.embedding(
+        input=usr_gender_id,
+        size=[USR_GENDER_DICT_SIZE, 16],
+        param_attr='gender_table',
+        is_sparse=IS_SPARSE)
+
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+
+    usr_age_emb = layers.embedding(
+        input=usr_age_id,
+        size=[USR_AGE_DICT_SIZE, 16],
+        is_sparse=IS_SPARSE,
+        param_attr='age_table')
+
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+
+    usr_job_emb = layers.embedding(
+        input=usr_job_id,
+        size=[USR_JOB_DICT_SIZE, 16],
+        param_attr='job_table',
+        is_sparse=IS_SPARSE)
+
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+    concat_embed = layers.concat(
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return usr_combined_features
+
+
+def get_mov_combined_features():
+
+    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+
+    mov_emb = layers.embedding(
+        input=mov_id,
+        dtype='float32',
+        size=[MOV_DICT_SIZE, 32],
+        param_attr='movie_table',
+        is_sparse=IS_SPARSE)
+
+    mov_fc = layers.fc(input=mov_emb, size=32)
+
+    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+
+    category_id = layers.data(
+        name='category_id', shape=[1], dtype='int64', lod_level=1)
+
+    mov_categories_emb = layers.embedding(
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_categories_hidden = layers.sequence_pool(
+        input=mov_categories_emb, pool_type="sum")
+
+    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+
+    mov_title_id = layers.data(
+        name='movie_title', shape=[1], dtype='int64', lod_level=1)
+
+    mov_title_emb = layers.embedding(
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_title_conv = nets.sequence_conv_pool(
+        input=mov_title_emb,
+        num_filters=32,
+        filter_size=3,
+        act="tanh",
+        pool_type="sum")
+
+    concat_embed = layers.concat(
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+    # FIXME(dzh) : need tanh operator
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return mov_combined_features
+
+
+def inference_program():
+    usr_combined_features = get_usr_combined_features()
+    mov_combined_features = get_mov_combined_features()
+
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
+
+    return scale_infer
+
+
+def train_program():
+
+    scale_infer = inference_program()
+
+    label = layers.data(name='score', shape=[1], dtype='float32')
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
+    avg_cost = layers.mean(square_cost)
+
+    return [avg_cost, scale_infer]
+
+
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.2)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    feed_order = [
+        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
+        'movie_title', 'score'
+    ]
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
+            avg_cost_set = trainer.test(
+                reader=test_reader, feed_order=feed_order)
+
+            # get avg cost
+            avg_cost = np.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+
+            if float(avg_cost) < 4:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                              float(avg_cost)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=feed_order)
+
+
+def infer(use_cuda, inference_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        inference_program, param_path=params_dirname, place=place)
+
+    # Use the first data from paddle.dataset.movielens.test() as input.
+    # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
+    # where `data` is a list of sequences of index numbers, `lod` is 
+    # the level of detail (lod) info associated with `data`.
+    # For example, data = [[10, 2, 3], [2, 3]] means that it contains
+    # two sequences of indexes, of length 3 and 2, respectively.
+    # Correspondingly, lod = [[3, 2]] contains one level of detail info,
+    # indicating that `data` consists of two sequences of length 3 and 2. 
+    user_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    age_id = fluid.create_lod_tensor([[0]], [[1]], place)
+    job_id = fluid.create_lod_tensor([[10]], [[1]], place)
+    movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
+    category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
+    movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
+                                          place)
+
+    results = inferencer.infer(
+        {
+            'user_id': user_id,
+            'gender_id': gender_id,
+            'age_id': age_id,
+            'job_id': job_id,
+            'movie_id': movie_id,
+            'category_id': category_id,
+            'movie_title': movie_title
+        },
+        return_numpy=False)
+
+    print("infer results: ", np.array(results[0]))
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "recommender_system.inference.model"
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        params_dirname=params_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    main(USE_GPU)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
new file mode 100644
index 0000000000..d71147a85e
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
@@ -0,0 +1,12 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# This test is buggy
+# py_test(test_understand_sentiment_dynamic_rnn SRCS
+# 	test_understand_sentiment_dynamic_rnn.py SERIAL)
+LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/notest_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/notest_understand_sentiment_stacked_lstm.py
deleted file mode 100644
index 9948e5c023..0000000000
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/notest_understand_sentiment_stacked_lstm.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-from functools import partial
-
-CLASS_DIM = 2
-EMB_DIM = 128
-HID_DIM = 512
-STACKED_NUM = 3
-
-
-def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
-    assert stacked_num % 2 == 1
-
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-
-    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
-    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
-
-    inputs = [fc1, lstm1]
-
-    for i in range(2, stacked_num + 1):
-        fc = fluid.layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = fluid.layers.dynamic_lstm(
-            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
-        inputs = [fc, lstm]
-
-    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
-    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
-
-    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
-                                 size=class_dim,
-                                 act='softmax')
-    return prediction
-
-
-def inference_network(word_dict):
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    dict_dim = len(word_dict)
-    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
-                           STACKED_NUM)
-    return net
-
-
-def train_network(word_dict):
-    prediction = inference_network(word_dict)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy
-
-
-def train(use_cuda, save_path):
-    BATCH_SIZE = 128
-    EPOCH_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-
-    test_data = paddle.batch(
-        paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
-
-    def event_handler(event):
-        if isinstance(event, fluid.EndIteration):
-            if (event.batch_id % 10) == 0:
-                avg_cost, accuracy = trainer.test(reader=test_data)
-
-                print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format(
-                    event.batch_id + 1, avg_cost, accuracy))
-
-                if accuracy > 0.01:  # Low threshold for speeding up CI
-                    trainer.params.save(save_path)
-                    return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    trainer = fluid.Trainer(
-        partial(train_network, word_dict),
-        optimizer=fluid.optimizer.Adagrad(learning_rate=0.002),
-        place=place,
-        event_handler=event_handler)
-
-    trainer.train(train_data, EPOCH_NUM, event_handler=event_handler)
-
-
-def infer(use_cuda, save_path):
-    params = fluid.Params(save_path)
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    word_dict = paddle.dataset.imdb.word_dict()
-    inferencer = fluid.Inferencer(
-        partial(inference_network, word_dict), params, place=place)
-
-    def create_random_lodtensor(lod, place, low, high):
-        data = np.random.random_integers(low, high,
-                                         [lod[-1], 1]).astype("int64")
-        res = fluid.LoDTensor()
-        res.set(data, place)
-        res.set_lod([lod])
-        return res
-
-    lod = [0, 4, 10]
-    tensor_words = create_random_lodtensor(
-        lod, place, low=0, high=len(word_dict) - 1)
-    results = inferencer.infer({'words': tensor_words})
-    print("infer results: ", results)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    save_path = "understand_sentiment_stacked_lstm.inference.model"
-    train(use_cuda, save_path)
-    infer(use_cuda, save_path)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
new file mode 100644
index 0000000000..9b61f7a00c
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+HID_DIM = 512
+BATCH_SIZE = 128
+
+
+def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+    return net
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_conv.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
new file mode 100644
index 0000000000..aa7c567b4d
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+BATCH_SIZE = 128
+LSTM_SIZE = 128
+
+
+def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(ipt, hidden, size):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            return gate0 + gate1
+
+        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                        lstm_size))
+        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                       lstm_size))
+
+        cell = forget_gate * prev_cell + input_gate * cell_gate
+        hidden = output_gate * fluid.layers.tanh(x=cell)
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+
+    last = fluid.layers.sequence_last_step(rnn())
+    prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax")
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE)
+    return pred
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_conv.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
new file mode 100644
index 0000000000..113dda88ca
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+HID_DIM = 512
+STACKED_NUM = 3
+BATCH_SIZE = 128
+
+
+def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
+    assert stacked_num % 2 == 1
+
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
+                           STACKED_NUM)
+    return net
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_stacked_lstm.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
new file mode 100644
index 0000000000..673c965b66
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
index bf86cd9acf..ba44f72d9b 100644
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
@@ -25,16 +25,6 @@ HIDDEN_SIZE = 256
 N = 5
 BATCH_SIZE = 32
 
-
-def create_random_lodtensor(lod, place, low, high):
-    # The range of data elements is [low, high]
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 word_dict = paddle.dataset.imikolov.build_dict()
 dict_size = len(word_dict)
 
@@ -90,7 +80,11 @@ def train_program(is_sparse):
     return avg_cost
 
 
-def train(use_cuda, train_program, save_dirname):
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
     test_reader = paddle.batch(
@@ -107,16 +101,14 @@ def train(use_cuda, train_program, save_dirname):
             print("loss= ", avg_cost)
 
             if avg_cost < 10.0:
-                trainer.save_params(save_dirname)
+                trainer.save_params(params_dirname)
                 trainer.stop()
 
             if math.isnan(avg_cost):
                 sys.exit("got NaN loss, training failed.")
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        optimizer=fluid.optimizer.SGD(learning_rate=0.001),
-        place=place)
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
 
     trainer.train(
         reader=train_reader,
@@ -125,16 +117,28 @@ def train(use_cuda, train_program, save_dirname):
         feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
 
 
-def infer(use_cuda, inference_program, save_dirname=None):
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=save_dirname, place=place)
-
-    lod = [0, 1]
-    first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
+    # is simply an index to look up for the corresponding word vector and hence 
+    # the shape of word (base_shape) should be [1]. The length-based level of 
+    # detail (lod) info of each LoDtensor should be [[1]] meaning there is only 
+    # one lod_level and there is only one sequence of one word on this level.
+    # Note that lod info should be a list of lists.
+    lod = [[1]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    first_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
+    second_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
+    third_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
+    fourth_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
 
     result = inferencer.infer(
         {
@@ -151,17 +155,17 @@ def main(use_cuda, is_sparse):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
-    save_path = "word2vec.inference.model"
+    params_dirname = "word2vec.inference.model"
 
     train(
         use_cuda=use_cuda,
         train_program=partial(train_program, is_sparse),
-        save_dirname=save_path)
+        params_dirname=params_dirname)
 
     infer(
         use_cuda=use_cuda,
         inference_program=partial(inference_program, is_sparse),
-        save_dirname=save_path)
+        params_dirname=params_dirname)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index 792ed7368d..c6687e8ad7 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -125,14 +125,6 @@ def stacked_lstm_net(data,
     return avg_cost, accuracy, prediction
 
 
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(word_dict,
           net_method,
           use_cuda,
@@ -242,9 +234,21 @@ def infer(word_dict, use_cuda, save_dirname=None):
 
         word_dict_len = len(word_dict)
 
-        lod = [0, 4, 10]
-        tensor_words = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+        # which has only one lod level. Then the created LoDTensor will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # length 3, 4 and 2, respectively. 
+        # Note that lod info should be a list of lists.
+        lod = [[3, 4, 2]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        tensor_words = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index f1ee5dfd99..bc8a1aafc8 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -116,29 +116,6 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     return feature_out
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, save_dirname=None, is_local=True):
     # define network topology
     word = fluid.layers.data(
@@ -271,23 +248,35 @@ def infer(use_cuda, save_dirname=None):
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
-        lod = [0, 4, 10]
-        word = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        pred = create_random_lodtensor(
-            lod, place, low=0, high=pred_dict_len - 1)
-        ctx_n2 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_n1 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_0 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_p1 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_p2 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        mark = create_random_lodtensor(
-            lod, place, low=0, high=mark_dict_len - 1)
+        # Setup inputs by creating LoDTensors to represent sequences of words.
+        # Here each word is the basic element of these LoDTensors and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+        # which has only one lod level. Then the created LoDTensors will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # length 3, 4 and 2, respectively. 
+        # Note that lod info should be a list of lists.
+        lod = [[3, 4, 2]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        pred = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=pred_dict_len - 1)
+        ctx_n2 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_n1 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_0 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_p1 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_p2 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        mark = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=mark_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index c4b6519a20..d8499fa3f7 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -157,28 +157,6 @@ def decoder_decode(context, is_sparse):
     return translation_ids, translation_scores
 
 
-def set_init_lod(data, lod, place):
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod(lod)
-    return res
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train_main(use_cuda, is_sparse, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
@@ -202,23 +180,25 @@ def train_main(use_cuda, is_sparse, is_local=True):
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
 
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
     exe = Executor(place)
 
     def train_loop(main_program):
         exe.run(framework.default_startup_program())
 
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
         batch_id = 0
         for pass_id in xrange(1):
             for data in train_data():
-                word_data = to_lodtensor(map(lambda x: x[0], data), place)
-                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
                 outs = exe.run(main_program,
-                               feed={
-                                   'src_word_id': word_data,
-                                   'target_language_word': trg_word,
-                                   'target_language_next_word': trg_word_next
-                               },
+                               feed=feeder.feed(data),
                                fetch_list=[avg_cost])
                 avg_cost_val = np.array(outs[0])
                 print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
@@ -268,26 +248,32 @@ def decode_main(use_cuda, is_sparse):
         [1. for _ in range(batch_size)], dtype='float32')
     init_ids_data = init_ids_data.reshape((batch_size, 1))
     init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_lod = [i for i in range(batch_size)] + [batch_size]
+    init_lod = [1] * batch_size
     init_lod = [init_lod, init_lod]
 
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
+    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
-    for _, data in enumerate(train_data()):
-        init_ids = set_init_lod(init_ids_data, init_lod, place)
-        init_scores = set_init_lod(init_scores_data, init_lod, place)
 
-        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+    feed_order = ['src_word_id']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    for data in train_data():
+        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict['init_ids'] = init_ids
+        feed_dict['init_scores'] = init_scores
 
         result_ids, result_scores = exe.run(
             framework.default_main_program(),
-            feed={
-                'src_word_id': src_word_data,
-                'init_ids': init_ids,
-                'init_scores': init_scores
-            },
+            feed=feed_dict,
             fetch_list=[translation_ids, translation_scores],
             return_numpy=False)
         print result_ids.lod()
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index 7be924f762..65d6552acc 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
     test_reader = paddle.batch(
         paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
 
-    feeding = {
-        'user_id': 0,
-        'gender_id': 1,
-        'age_id': 2,
-        'job_id': 3,
-        'movie_id': 4,
-        'category_id': 5,
-        'movie_title': 6,
-        'score': 7
-    }
-
-    def func_feed(feeding, data):
-        feed_tensors = {}
-        for (key, idx) in feeding.iteritems():
-            tensor = fluid.LoDTensor()
-            if key != "category_id" and key != "movie_title":
-                if key == "score":
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "float32")
-                else:
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "int64")
-            else:
-                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
-                                 data)
-                lod_info = [len(item) for item in numpy_data]
-                offset = 0
-                lod = [offset]
-                for item in lod_info:
-                    offset += item
-                    lod.append(offset)
-                numpy_data = np.concatenate(numpy_data, axis=0)
-                tensor.set_lod([lod])
-
-            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
-            tensor.set(numpy_data, place)
-            feed_tensors[key] = tensor
-        return feed_tensors
+    feed_order = [
+        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
+        'movie_title', 'score'
+    ]
 
     def train_loop(main_program):
         exe.run(framework.default_startup_program())
 
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for batch_id, data in enumerate(train_reader()):
                 # train a mini-batch
                 outs = exe.run(program=main_program,
-                               feed=func_feed(feeding, data),
+                               feed=feeder.feed(data),
                                fetch_list=[avg_cost])
                 out = np.array(outs[0])
                 if (batch_id + 1) % 10 == 0:
                     avg_cost_set = []
                     for test_data in test_reader():
-                        avg_cost_np = exe.run(
-                            program=test_program,
-                            feed=func_feed(feeding, test_data),
-                            fetch_list=[avg_cost])
+                        avg_cost_np = exe.run(program=test_program,
+                                              feed=feeder.feed(test_data),
+                                              fetch_list=[avg_cost])
                         avg_cost_set.append(avg_cost_np[0])
                         break  # test only 1 segment for speeding up CI
 
@@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
 
-    def create_lod_tensor(data, lod=None):
-        tensor = fluid.LoDTensor()
-        if lod is None:
-            # Tensor, the shape is [batch_size, 1]
-            index = 0
-            lod_0 = [index]
-            for l in range(len(data)):
-                index += 1
-                lod_0.append(index)
-            lod = [lod_0]
-        tensor.set_lod(lod)
-
-        flattened_data = np.concatenate(data, axis=0).astype("int64")
-        flattened_data = flattened_data.reshape([len(flattened_data), 1])
-        tensor.set(flattened_data, place)
-        return tensor
-
     inference_scope = fluid.core.Scope()
     with fluid.scope_guard(inference_scope):
         # Use fluid.io.load_inference_model to obtain the inference program desc,
@@ -307,26 +260,33 @@ def infer(use_cuda, save_dirname=None):
 
         # Use the first data from paddle.dataset.movielens.test() as input
         assert feed_target_names[0] == "user_id"
-        user_id = create_lod_tensor([[1]])
+        # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
+        # where `data` is a list of sequences of index numbers, `lod` is 
+        # the level of detail (lod) info associated with `data`.
+        # For example, data = [[10, 2, 3], [2, 3]] means that it contains
+        # two sequences of indexes, of length 3 and 2, respectively.
+        # Correspondingly, lod = [[3, 2]] contains one level of detail info,
+        # indicating that `data` consists of two sequences of length 3 and 2. 
+        user_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[1] == "gender_id"
-        gender_id = create_lod_tensor([[1]])
+        gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[2] == "age_id"
-        age_id = create_lod_tensor([[0]])
+        age_id = fluid.create_lod_tensor([[0]], [[1]], place)
 
         assert feed_target_names[3] == "job_id"
-        job_id = create_lod_tensor([[10]])
+        job_id = fluid.create_lod_tensor([[10]], [[1]], place)
 
         assert feed_target_names[4] == "movie_id"
-        movie_id = create_lod_tensor([[783]])
+        movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
 
         assert feed_target_names[5] == "category_id"
-        category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
+        category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
 
         assert feed_target_names[6] == "movie_title"
-        movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
-                                        [[0, 5]])
+        movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
+                                              [[5]], place)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
similarity index 87%
rename from python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py
rename to python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index ce640dece8..7ada57def6 100644
--- a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -152,29 +152,6 @@ def seq_to_seq_net():
     return avg_cost, prediction
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, save_dirname=None):
     [avg_cost, prediction] = seq_to_seq_net()
 
@@ -188,22 +165,20 @@ def train(use_cuda, save_dirname=None):
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = Executor(place)
-
     exe.run(framework.default_startup_program())
 
+    feed_order = ['source_sequence', 'target_sequence', 'label_sequence']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
     batch_id = 0
     for pass_id in xrange(2):
         for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
-
             outs = exe.run(framework.default_main_program(),
-                           feed={
-                               'source_sequence': word_data,
-                               'target_sequence': trg_word,
-                               'label_sequence': trg_word_next
-                           },
+                           feed=feeder.feed(data),
                            fetch_list=[avg_cost])
 
             avg_cost_val = np.array(outs[0])
@@ -237,9 +212,23 @@ def infer(use_cuda, save_dirname=None):
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
-        lod = [0, 4, 10]
-        word_data = create_random_lodtensor(lod, place, low=0, high=1)
-        trg_word = create_random_lodtensor(lod, place, low=0, high=1)
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the length_based level of detail (lod) info is set to [[4, 6]],
+        # which has only one lod level. Then the created LoDTensor will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for two sentences of 
+        # length 4 and 6, respectively. 
+        # Note that lod info should be a list of lists.
+        lod = [[4, 6]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        word_data = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=1)
+        trg_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 30e1a5040c..3118d88701 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -21,15 +21,6 @@ import math
 import sys
 
 
-def create_random_lodtensor(lod, place, low, high):
-    # The range of data elements is [low, high]
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
     PASS_NUM = 100
     EMBED_SIZE = 32
@@ -175,16 +166,23 @@ def infer(use_cuda, save_dirname=None):
         word_dict = paddle.dataset.imikolov.build_dict()
         dict_size = len(word_dict)
 
-        # Setup inputs, by creating 4 words, the lod of which should be [0, 1]
-        lod = [0, 1]
-        first_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        second_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        third_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        fourth_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
+        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
+        # is simply an index to look up for the corresponding word vector and hence 
+        # the shape of word (base_shape) should be [1]. The length-based level of 
+        # detail (lod) info of each LoDtensor should be [[1]] meaning there is only 
+        # one lod_level and there is only one sequence of one word on this level.
+        # Note that lod info should be a list of lists.
+        lod = [[1]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        first_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
+        second_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
+        third_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
+        fourth_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
 
         assert feed_target_names[0] == 'firstw'
         assert feed_target_names[1] == 'secondw'
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index a1ca6d981f..fa696acdfa 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -80,21 +80,6 @@ def encoder_decoder():
     return rnn()
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
 def main():
     rnn_out = encoder_decoder()
     label = layers.data(
@@ -122,18 +107,21 @@ def main():
 
     exe.run(framework.default_startup_program())
 
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    feed_list = [
+        fluid.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
     batch_id = 0
     for pass_id in xrange(10):
         for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
             outs = exe.run(fluid.default_main_program(),
-                           feed={
-                               'src_word_id': word_data,
-                               'target_language_word': trg_word,
-                               'target_language_next_word': trg_word_next
-                           },
+                           feed=feeder.feed(data),
                            fetch_list=[avg_cost])
             avg_cost_val = np.array(outs[0])
             print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py
similarity index 100%
rename from python/paddle/fluid/tests/test_concurrency.py
rename to python/paddle/fluid/tests/no_test_concurrency.py
diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py
index e54c73b295..6cc291dfcf 100644
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
@@ -44,8 +44,8 @@ create_random_data_generator_op = startup_block.append_op(
     attrs={
         "shape_concat": [1, 2, 1, 1],
         "ranks": [2, 2],
-        "min": 0.0,
-        "max": 1.0,
+        "low": 0.0,
+        "high": 1.0,
         'lod_levels': [0, 0]
     })
 
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
new file mode 100644
index 0000000000..013d72f418
--- /dev/null
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -0,0 +1,91 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod
+import numpy
+import unittest
+
+
+class TestLoDTensor(unittest.TestCase):
+    def test_validate_lod(self):
+        lod = (1, 2, 1)
+        self.assertRaises(AssertionError, _validate_lod, lod, -1)
+        lod = [[1, 2], (2, 3)]
+        self.assertRaises(AssertionError, _validate_lod, lod, -1)
+        lod = [1, 2, 3]
+        self.assertRaises(AssertionError, _validate_lod, lod, -1)
+
+        lod = []
+        self.assertTrue(_validate_lod(lod, -1))
+        lod = [[], [1], [3]]
+        self.assertFalse(_validate_lod(lod, -1))
+        lod = [[0], [-1], [3]]
+        self.assertFalse(_validate_lod(lod, -1))
+
+        # Each level's sum should be equal to the number of items in the next level
+        # Moreover, last level's sum should be equal to the tensor height
+        lod = [[2, 3], [1, 3, 1, 2, 1]]
+        self.assertTrue(_validate_lod(lod, tensor_height=8))
+        lod = [[1, 3], [2, 1, 3]]
+        self.assertFalse(_validate_lod(lod, tensor_height=6))
+        lod = [[1, 3], [2, 1, 3, 4]]
+        self.assertFalse(_validate_lod(lod, tensor_height=5))
+
+    def test_convert_lod(self):
+        lod = [[1, 2, 3]]
+        converted_lod = [[0, 1, 3, 6]]
+        self.assertEqual(_convert_lod(lod), converted_lod)
+
+        lod = [[2, 3], [1, 3, 1, 2, 1]]
+        converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]]
+        self.assertEqual(_convert_lod(lod), converted_lod)
+
+    def test_create_lod_tensor(self):
+        # Create LoDTensor from a list
+        data = [[1, 2, 3], [3, 4]]
+        wrong_lod = [[2, 2]]
+        correct_lod = [[3, 2]]
+        self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
+                          fluid.CPUPlace())
+        tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
+        self.assertEqual(tensor.lod(), [[0, 3, 5]])
+
+        # Create LoDTensor from numpy array
+        data = numpy.random.random([10, 1])
+        lod = [[2, 1], [3, 3, 4]]
+        tensor = create_lod_tensor(data, lod, fluid.CPUPlace())
+        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
+
+        # Create LoDTensor from another LoDTensor, they are differnt instances
+        new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]]
+        new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace())
+        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
+        self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]])
+
+    def test_create_random_int_lodtensor(self):
+        # The shape of a word, commonly used in speech and NLP problem, is [1]
+        shape = [1]
+        lod = [[2, 3, 5]]
+        dict_size = 10000
+        low = 0
+        high = dict_size - 1
+        tensor = create_random_int_lodtensor(lod, shape,
+                                             fluid.CPUPlace(), low, high)
+        self.assertEqual(tensor.lod(), [[0, 2, 5, 10]])
+        self.assertEqual(tensor.shape(), [10, 1])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 2ae9653953..ab683bc101 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -17,7 +17,7 @@ endif(NOT WITH_DISTRIBUTE)
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_nce) # IXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
+list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
 
@@ -26,7 +26,7 @@ list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not
 
 function(py_test_modules TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs MODULES DEPS ENVS)
     cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -34,76 +34,19 @@ function(py_test_modules TARGET_NAME)
              COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
              ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (py_test_modules_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction()
-
-list(REMOVE_ITEM TEST_OPS test_sequence_expand)
-
-# test time consuming OPs in a separate process for expliot parallism
-list(REMOVE_ITEM TEST_OPS test_parallel_executor)
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
-list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
-list(REMOVE_ITEM TEST_OPS test_mul_op)
-
-# tests that need to be run in separate process.
-list(REMOVE_ITEM TEST_OPS test_multihead_attention)
-list(REMOVE_ITEM TEST_OPS test_calc_gradient)
-list(REMOVE_ITEM TEST_OPS test_while_op)
-list(REMOVE_ITEM TEST_OPS test_lod_array_length_op)
-list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor)
-list(REMOVE_ITEM TEST_OPS test_profiler)
-list(REMOVE_ITEM TEST_OPS test_nvprof)
-list(REMOVE_ITEM TEST_OPS test_normalization_wrapper)
-list(REMOVE_ITEM TEST_OPS test_executor_and_mul)
-list(REMOVE_ITEM TEST_OPS test_assign_value_op)
-list(REMOVE_ITEM TEST_OPS test_array_read_write_op)
-list(REMOVE_ITEM TEST_OPS test_lod_rank_table)
-list(REMOVE_ITEM TEST_OPS test_weight_normalization)
-list(REMOVE_ITEM TEST_OPS test_conditional_block)
-list(REMOVE_ITEM TEST_OPS test_parameter)
-list(REMOVE_ITEM TEST_OPS test_registry)
-list(REMOVE_ITEM TEST_OPS test_fetch_var)
-list(REMOVE_ITEM TEST_OPS test_parallel_op)
-list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
-list(REMOVE_ITEM TEST_OPS test_network_with_dtype)
-
-# tests that can be bundled together in one python process for speed.
-if(WITH_FAST_BUNDLE_TEST)
-    py_test_modules("test_all_ops" MODULES ${TEST_OPS})
-else()
-    foreach(TEST_OP ${TEST_OPS})
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-    endforeach(TEST_OP)
-endif(WITH_FAST_BUNDLE_TEST)
-
-#
-py_test_modules(test_sequence_expand MODULES test_sequence_expand)
-# tests with high overhead
-py_test_modules(test_parallel_executor MODULES test_parallel_executor)
-py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
-py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
-py_test_modules(test_mul_op MODULES test_mul_op)
-py_test_modules(test_network_with_dtype MODULES test_network_with_dtype)
-
-# tests that need to be run in separate process.
-py_test_modules(test_multihead_attention MODULES test_multihead_attention)
-py_test_modules(test_calc_gradient MODULES test_calc_gradient)
-py_test_modules(test_while_op MODULES test_while_op)
-py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op)
-py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor)
-py_test_modules(test_profiler MODULES test_profiler)
-py_test_modules(test_nvprof MODULES test_nvprof)
-py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper)
-py_test_modules(test_executor_and_mul MODULES test_executor_and_mul)
-py_test_modules(test_assign_value_op MODULES test_assign_value_op)
-py_test_modules(test_array_read_write_op MODULES test_array_read_write_op)
-py_test_modules(test_lod_rank_table MODULES test_lod_rank_table)
-py_test_modules(test_weight_normalization MODULES test_weight_normalization)
-py_test_modules(test_conditional_block MODULES test_conditional_block)
-py_test_modules(test_parameter MODULES test_parameter)
-py_test_modules(test_registry MODULES test_registry)
-py_test_modules(test_fetch_var MODULES test_fetch_var)
-py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input)
-py_test_modules(test_parallel_op MODULES test_parallel_op)
-py_test_modules(test_dist_train MODULES test_dist_train)
+#list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
+#list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+# TODO(wuyi): this test hungs on CI, will add it back later
+list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
+foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+endforeach(TEST_OP)
+py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
+py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
new file mode 100644
index 0000000000..e891ee932f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import time
+import itertools
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from op_test import OpTest
+
+
+class BenchmarkSuite(OpTest):
+    def timeit_function(self, callback, iters, *args, **kwargs):
+        assert iters != 0, "Iters should >= 1"
+        start = time.time()
+        for i in range(iters):
+            callback(*args, **kwargs)
+        elapse = time.time() - start
+        return elapse / iters
+
+    def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
+        for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
+                                                        fetch_list):
+            # the cpu version is baseline, expect gpu version keep same with cpu version.
+            expect = item_cpu_out
+            expect_t = np.array(item_cpu_out)
+            actual = item_gpu_out
+            actual_t = np.array(item_gpu_out)
+            var_name = variable if isinstance(variable,
+                                              basestring) else variable.name
+            self.assertTrue(
+                np.allclose(
+                    actual_t, expect_t, atol=atol),
+                "Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
+                str(expect_t))
+            self.assertListEqual(actual.lod(),
+                                 expect.lod(),
+                                 "Output (" + var_name + ") has different lod")
+
+    def _get_input_names(self):
+        inputs = []
+        for name, value in self.inputs.iteritems():
+            if isinstance(value, list):
+                inputs.extend([sub_name for sub_name, _ in value])
+            inputs.append(name)
+        return inputs
+
+    def _get_output_names(self):
+        outputs = []
+        for var_name, var in self.outputs.iteritems():
+            if isinstance(var, list):
+                for sub_var_name, sub_var in var:
+                    outputs.append(sub_var_name)
+            else:
+                outputs.append(var_name)
+        if len(outputs) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                outputs.append(str(out_name))
+        return outputs
+
+    def check_output_stability(self, atol=1e-8):
+        places = self._get_places()
+        if len(places) < 2:
+            return
+        cpu_outs, fetch_list = self._calc_output(places[0])
+        gpu_outs, _ = self._calc_output(places[1])
+        self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)
+
+    def timeit_output_with_place(self, place, iters):
+        return self.timeit_function(self.calc_output, iters, place)
+
+    def timeit_output(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_output_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
+
+    def timeit_grad_with_place(self, place, iters=100):
+        inputs_to_check = self._get_input_names()
+        output_names = self._get_output_names()
+        return self.timeit_function(
+            self._get_gradient,
+            iters,
+            inputs_to_check,
+            place,
+            output_names,
+            no_grad_set=None)
+
+    def timeit_grad(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_grad_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_grad_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
new file mode 100644
index 0000000000..91a5f1bca4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from benchmark import BenchmarkSuite
+from op_test import OpTest
+
+# This is a demo op test case for operator benchmarking and high resolution number stability alignment.
+
+
+class TestSumOp(BenchmarkSuite):
+    def setUp(self):
+        self.op_type = "sum"
+        self.customize_testcase()
+        self.customize_fetch_list()
+
+    def customize_fetch_list(self):
+        """
+        customize fetch list, configure the wanted variables.
+        >>> self.fetch_list = ["Out"]
+        """
+        self.fetch_list = ["Out"]
+        # pass
+
+    def customize_testcase(self):
+        # a test case
+        x0 = np.random.random((300, 400)).astype('float32')
+        x1 = np.random.random((300, 400)).astype('float32')
+        x2 = np.random.random((300, 400)).astype('float32')
+
+        # NOTE: if the output is empty, then it will autofilled by benchmarkSuite.
+        # only the output dtype is used, the shape, lod and data is computed from input.
+        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+        self.outputs = {"Out": x0 + x1 + x2}
+
+    def test_check_output(self):
+        """
+        compare the output with customized output. In this case,
+        you should set the correct output by hands.
+        >>> self.outputs = {"Out": x0 + x1 + x2}
+        """
+        self.check_output(atol=1e-8)
+
+    def test_output_stability(self):
+        # compare the cpu gpu output in high resolution.
+        self.check_output_stability()
+
+    def test_timeit_output(self):
+        """
+        perf the op, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
+        >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
+        """
+        self.timeit_output(iters=100)
+
+    def test_timeit_grad(self):
+        """
+        perf the op gradient, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
+        >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
+        """
+        self.timeit_grad(iters=100)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 299ab8e51f..307caae4b0 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -15,13 +15,17 @@
 import unittest
 import numpy as np
 import random
+import time
 import itertools
-import paddle.fluid.core as core
 import collections
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.backward import append_backward
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
-from paddle.fluid.framework import Program, OpProtoHolder
+from paddle.fluid.framework import Program, OpProtoHolder, Variable
+from testsuite import create_op, set_input, append_input_output, append_loss_ops
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
@@ -33,67 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
     return prob
 
 
-def create_op(scope, op_type, inputs, outputs, attrs):
-    kwargs = dict()
-
-    def __create_var__(name, var_name):
-        scope.var(var_name).get_tensor()
-        kwargs[name].append(var_name)
-
-    for in_name, in_dup in Operator.get_op_inputs(op_type):
-        if in_name in inputs:
-            kwargs[in_name] = []
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, _ = item[0], item[1]
-                    __create_var__(in_name, sub_in_name)
-            else:
-                __create_var__(in_name, in_name)
-
-    for out_name, out_dup in Operator.get_op_outputs(op_type):
-        if out_name in outputs:
-            kwargs[out_name] = []
-            if out_dup:
-                sub_out = outputs[out_name]
-                for item in sub_out:
-                    sub_out_name, _ = item[0], item[1]
-                    __create_var__(out_name, sub_out_name)
-            else:
-                __create_var__(out_name, out_name)
-
-    for attr_name in Operator.get_op_attr_names(op_type):
-        if attr_name in attrs:
-            kwargs[attr_name] = attrs[attr_name]
-
-    return Operator(op_type, **kwargs)
-
-
-def set_input(scope, op, inputs, place):
-    def __set_input__(var_name, var):
-        if isinstance(var, tuple) or isinstance(var, np.ndarray):
-            tensor = scope.find_var(var_name).get_tensor()
-            if isinstance(var, tuple):
-                tensor.set_lod(var[1])
-                var = var[0]
-            tensor.set_dims(var.shape)
-            tensor.set(var, place)
-        elif isinstance(var, float):
-            scope.find_var(var_name).set_float(var)
-        elif isinstance(var, int):
-            scope.find_var(var_name).set_int(var)
-
-    for in_name, in_dup in Operator.get_op_inputs(op.type()):
-        if in_name in inputs:
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, sub_in_val = item[0], item[1]
-                    __set_input__(sub_in_name, sub_in_val)
-            else:
-                __set_input__(in_name, inputs[in_name])
-
-
 def get_numeric_gradient(place,
                          scope,
                          op,
@@ -167,54 +110,15 @@ def get_numeric_gradient(place,
     return gradient_flat.reshape(tensor_to_check.get_dims())
 
 
-def append_input_output(block, op_proto, np_list, is_input):
-    '''Insert VarDesc and generate Python variable instance'''
-    proto_list = op_proto.inputs if is_input else op_proto.outputs
-
-    def create_var(block, name, np_list, var_proto):
-        if name not in np_list:
-            assert var_proto.intermediate, "{} not found".format(name)
-            shape = None
-            lod_level = None
-        else:
-            np_value = np_list[name]
-            if isinstance(np_value, tuple):
-                shape = list(np_value[0].shape)
-                lod_level = len(np_value[1])
-            else:
-                shape = list(np_value.shape)
-                lod_level = 0
-        return block.create_var(
-            dtype="float32", shape=shape, lod_level=lod_level, name=name)
-
-    var_dict = {}
-    for var_proto in proto_list:
-        var_name = str(var_proto.name)
-        if is_input:
-            if (var_name not in np_list) and var_proto.dispensable:
-                continue
-            assert (var_name in np_list) or (var_proto.dispensable), \
-                "Missing {} as input".format(var_name)
-        if var_proto.duplicable:
-            assert isinstance(np_list[var_name], list), \
-                "Duplicable {} should be set as list".format(var_name)
-            var_list = []
-            for (name, np_value) in np_list[var_name]:
-                var_list.append(
-                    create_var(block, name, {name: np_value}, var_proto))
-            var_dict[var_name] = var_list
-        else:
-            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
-
-    return var_dict
-
-
 class OpTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         '''Fix random seeds to remove randomness from tests'''
         cls._np_rand_state = np.random.get_state()
         cls._py_rand_state = random.getstate()
+        cls.call_once = False
+        cls.dtype = "float32"
+        cls.outputs = {}
 
         np.random.seed(123)
         random.seed(124)
@@ -225,6 +129,31 @@ class OpTest(unittest.TestCase):
         np.random.set_state(cls._np_rand_state)
         random.setstate(cls._py_rand_state)
 
+    def try_call_once(self, data_type):
+        if not self.call_once:
+            self.call_once = True
+            self.dtype = data_type
+
+    def infer_dtype_from_inputs_outputs(self, inputs, outputs):
+        def infer_dtype(numpy_dict):
+            assert isinstance(
+                numpy_dict,
+                dict), "self.inputs, self.outputs must be numpy_dict"
+            for var_name, var_value in numpy_dict.iteritems():
+                if isinstance(var_value, (np.ndarray, np.generic)):
+                    self.try_call_once(var_value.dtype)
+                elif isinstance(var_value, (list, tuple)):
+                    # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+                    if len(var_value) > 1 and isinstance(var_value[1], (
+                            np.ndarray, np.generic)):
+                        instance = var_value[1]
+                        self.try_call_once(instance[1].dtype)
+                else:
+                    self.try_call_once("float32")
+
+        infer_dtype(inputs)
+        infer_dtype(outputs)
+
     def feed_var(self, input_vars, place):
         feed_map = {}
         for var_name in input_vars:
@@ -248,18 +177,14 @@ class OpTest(unittest.TestCase):
 
         return feed_map
 
-    def calc_output(self, place):
-        outs, _ = self._calc_output(place)
-        return outs
-
-    def _calc_output(self, place):
+    def _append_ops(self, block):
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
-
-        program = Program()
-        block = program.global_block()
-
-        inputs = append_input_output(block, op_proto, self.inputs, True)
-        outputs = append_input_output(block, op_proto, self.outputs, False)
+        "infer datatype from inputs and outputs for this test case"
+        self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
+        inputs = append_input_output(block, op_proto, self.inputs, True,
+                                     self.dtype)
+        outputs = append_input_output(block, op_proto, self.outputs, False,
+                                      self.dtype)
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -269,22 +194,68 @@ class OpTest(unittest.TestCase):
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)
 
-        fetch_list = []
-        for var_name, var in outputs.iteritems():
-            if var_name in self.outputs:
+    def _get_io_vars(self, block, numpy_inputs):
+        inputs = {}
+        for name, value in numpy_inputs.iteritems():
+            if isinstance(value, list):
+                var_list = [
+                    block.var(sub_name) for sub_name, sub_value in value
+                ]
+                inputs[name] = var_list
+            else:
+                inputs[name] = block.var(name)
+        return inputs
+
+    def _get_inputs(self, block):
+        return self._get_io_vars(block, self.inputs)
+
+    def _get_outputs(self, block):
+        return self._get_io_vars(block, self.outputs)
+
+    def calc_output(self, place):
+        outs, _ = self._calc_output(place)
+        return outs
+
+    def _calc_output(self, place, parallel=False):
+
+        program = Program()
+        block = program.global_block()
+        self._append_ops(block)
+
+        inputs = self._get_inputs(block)
+        outputs = self._get_outputs(block)
+        feed_map = self.feed_var(inputs, place)
+
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
+
+        fetch_list = getattr(self, "fetch_list", [])
+        # if the fetch_list is customized by user, we use it directly.
+        # if not, fill the fetch_list by the user configured outputs in test.
+        if len(fetch_list) == 0:
+            for var_name, var in outputs.iteritems():
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
                 else:
                     fetch_list.append(var)
-
-        feed_map = self.feed_var(inputs, place)
-
-        exe = Executor(place)
-        outs = exe.run(program,
-                       feed=feed_map,
-                       fetch_list=fetch_list,
-                       return_numpy=False)
+        # if the fetch_list still empty, fill the fetch_list by the operator output.
+        if len(fetch_list) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                fetch_list.append(str(out_name))
+        # fetch_list = map(block.var, fetch_list)
+        if not isinstance(fetch_list[0], Variable):
+            fetch_list = map(block.var, fetch_list)
+        outs = executor.run(program,
+                            feed=feed_map,
+                            fetch_list=fetch_list,
+                            return_numpy=False)
         return outs, fetch_list
 
     def check_output_with_place(self, place, atol):
@@ -340,17 +311,19 @@ class OpTest(unittest.TestCase):
                                          "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
-    def check_output(self, atol=1e-5):
-        places = [core.CPUPlace()]
+    def _get_places(self):
+        places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
+        return places
+
+    def check_output(self, atol=1e-5):
+        places = self._get_places()
         for place in places:
             self.check_output_with_place(place, atol)
 
     def check_output_customized(self, checker):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             outs = self.calc_output(place)
             outs = [np.array(out) for out in outs]
@@ -383,9 +356,7 @@ class OpTest(unittest.TestCase):
                    in_place=False,
                    max_relative_error=0.005,
                    user_defined_grads=None):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             self.check_grad_with_place(place, inputs_to_check, output_names,
                                        no_grad_set, numeric_grad_delta,
@@ -432,35 +403,6 @@ class OpTest(unittest.TestCase):
                                max_relative_error,
                                "Gradient Check On %s" % str(place))
 
-    @staticmethod
-    def _create_var_descs_(block, var_dict):
-        # FIXME: Try unify with `append_input_output`
-        for param_name in var_dict:
-            var = var_dict[param_name]
-            if not isinstance(var, list) and not isinstance(var, tuple):
-                var = [(param_name, var, None)]
-            if not isinstance(var[0], list) and not isinstance(var[0], tuple):
-                var = [(param_name, var[0], var[1])]
-
-            for i, item in enumerate(var):
-                if not isinstance(item[0], basestring):
-                    item = [[param_name] + list(item)]
-                if len(item) == 2:
-                    if isinstance(item[1], tuple):
-                        var[i] = [item[0], item[1][0], item[1][1]]
-                    else:
-                        # only set var name and value, set lod to None
-                        var[i] = list(item) + [None]
-            var_descs = [(block.create_var(
-                name=name, shape=each.shape, dtype=each.dtype), each, lod)
-                         for name, each, lod in var]
-
-            yield param_name, var_descs
-
-    @staticmethod
-    def _merge_list(iterable):
-        return reduce(lambda a, b: list(a) + list(b), iterable, [])
-
     @staticmethod
     def _numpy_to_lod_tensor(np_value, lod, place):
         tensor = core.LoDTensor()
@@ -473,9 +415,9 @@ class OpTest(unittest.TestCase):
     def np_dtype_to_fluid_dtype(input):
         """Change the dtype of float16 numpy array
 
-        numpy float16 is binded to paddle::platform::float16 
+        numpy float16 is binded to paddle::platform::float16
         in tensor_py.h via the help of uint16 data type since
-        the internal memory representation of float16 is 
+        the internal memory representation of float16 is
         uint16_t in paddle and np.uint16 in numpy, which are
         themselves binded together by pybind.
 
@@ -483,91 +425,39 @@ class OpTest(unittest.TestCase):
             input: input numpy array
 
         Returns:
-            input: The dtype of input will be changed to np.uint16 if 
+            input: The dtype of input will be changed to np.uint16 if
                 it is originally np.float16, such that the internal memory
-                of input will be reinterpreted as of dtype np.uint16. 
+                of input will be reinterpreted as of dtype np.uint16.
         """
         if input.dtype == np.float16:
             input.dtype = np.uint16
         return input
 
-    def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
+    def _get_gradient(self,
+                      input_to_check,
+                      place,
+                      output_names,
+                      no_grad_set,
+                      parallel=False):
         prog = Program()
         block = prog.global_block()
-        inputs_with_np = {
-            key: value
-            for (key, value) in OpTest._create_var_descs_(
-                block, getattr(self, 'inputs', {}))
-        }
-        outputs_with_np = {
-            key: val
-            for (key, val) in OpTest._create_var_descs_(
-                block, getattr(self, 'outputs', {}))
-        }
-        inputs = {
-            k: [item[0] for item in inputs_with_np[k]]
-            for k in inputs_with_np
-        }
-        outputs = {
-            k: [item[0] for item in outputs_with_np[k]]
-            for k in outputs_with_np
-        }
-
-        op = block.append_op(
-            type=self.op_type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=getattr(self, 'attrs', {}))
-
-        # infer variable type and infer shape in compile-time
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-
-        mean_inputs = map(block.var, output_names)
-
-        if len(mean_inputs) == 1:
-            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
-            op = block.append_op(
-                inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
-            op.desc.infer_var_type(block.desc)
-            op.desc.infer_shape(block.desc)
-        else:
-            avg_sum = []
-            for cur_loss in mean_inputs:
-                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
-                op = block.append_op(
-                    inputs={"X": [cur_loss]},
-                    outputs={"Out": [cur_avg_loss]},
-                    type="mean")
-                op.desc.infer_var_type(block.desc)
-                op.desc.infer_shape(block.desc)
-                avg_sum.append(cur_avg_loss)
-
-            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
-            op_sum = block.append_op(
-                inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
-            op_sum.desc.infer_var_type(block.desc)
-            op_sum.desc.infer_shape(block.desc)
-
-            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
-            op_loss = block.append_op(
-                inputs={"X": loss_sum},
-                outputs={"Out": loss},
-                type='scale',
-                attrs={'scale': 1.0 / float(len(avg_sum))})
-            op_loss.desc.infer_var_type(block.desc)
-            op_loss.desc.infer_shape(block.desc)
-
+        self._append_ops(block)
+        loss = append_loss_ops(block, output_names)
         param_grad_list = append_backward(
             loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
 
-        feed_dict = {
-            item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place)
-            for p_name in inputs_with_np for item in inputs_with_np[p_name]
-        }
+        inputs = self._get_inputs(block)
+        feed_dict = self.feed_var(inputs, place)
 
         fetch_list = [g for p, g in param_grad_list]
-        executor = Executor(place)
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
         return map(np.array,
                    executor.run(prog, feed_dict, fetch_list,
                                 return_numpy=False))
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
new file mode 100644
index 0000000000..829c5a1a5f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+import os
+import unittest
+import paddle.fluid as fluid
+import time
+import numpy as np
+
+__all__ = ['TestParallelExecutorBase']
+
+
+class TestParallelExecutorBase(unittest.TestCase):
+    def check_network_convergence(self,
+                                  method,
+                                  use_cuda=True,
+                                  memory_opt=True,
+                                  iter=50,
+                                  batch_size=None,
+                                  allow_op_delay=False,
+                                  feed_dict=None,
+                                  seed=None,
+                                  use_parallel_executor=True,
+                                  balance_parameter_opt_between_cards=False):
+        def run_executor(exe, feed, fetch_list, program=None):
+            if isinstance(exe, fluid.ParallelExecutor):
+                res = exe.run(fetch_list=fetch_list, feed=feed)
+            elif isinstance(exe, fluid.Executor):
+                if program is None:
+                    program = fluid.default_main_program()
+                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
+            else:
+                raise ValueError('Unkown type exe')
+            return res
+
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = 1  # Fix random seed
+        with fluid.program_guard(main, startup):
+            if seed is not None:
+                startup.random_seed = seed
+            loss = method(use_feed=feed_dict is not None)
+            adam = fluid.optimizer.Adam()
+            adam.minimize(loss)
+            if memory_opt:
+                fluid.memory_optimize(main)
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            startup_exe = fluid.Executor(place)
+            startup_exe.run(startup)
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.allow_op_delay = allow_op_delay
+
+            build_strategy = fluid.BuildStrategy()
+            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+            if use_parallel_executor:
+                exe = fluid.ParallelExecutor(
+                    use_cuda,
+                    loss_name=loss.name,
+                    exec_strategy=exec_strategy,
+                    build_strategy=build_strategy)
+            else:
+                exe = fluid.Executor(place=place)
+
+            if batch_size is not None:
+                batch_size *= fluid.core.get_cuda_device_count(
+                ) if use_cuda else int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            begin = time.time()
+            first_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            first_loss = np.array(first_loss)
+
+            for i in xrange(iter):
+                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
+
+            last_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            end = time.time()
+
+            if batch_size is not None:
+                print "%.4f Instance per second" % (
+                    (batch_size * iter + 2) / (end - begin))
+
+            last_loss = np.array(last_loss)
+
+            print first_loss, last_loss
+            # self.assertGreater(first_loss[0], last_loss[0])
+            return first_loss, last_loss
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
new file mode 100644
index 0000000000..e04412f809
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class BaseTestCase(OpTest):
+    def initTestCase(self):
+        self.op_type = 'arg_min'
+        self.dims = (3, 4, 5)
+        self.dtype = 'float32'
+        self.axis = 0
+
+    def setUp(self):
+        self.initTestCase()
+        self.x = (1000 * np.random.random(self.dims)).astype(self.dtype)
+        self.inputs = {'X': self.x}
+        self.attrs = {'axis': self.axis}
+        if self.op_type == "arg_min":
+            self.outputs = {'Out': np.argmin(self.x, axis=self.axis)}
+        else:
+            self.outputs = {'Out': np.argmax(self.x, axis=self.axis)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase0(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_max'
+        self.dims = (3, 4, 5)
+        self.dtype = 'float32'
+        self.axis = 0
+
+
+class TestCase1(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_min'
+        self.dims = (3, 4)
+        self.dtype = 'float64'
+        self.axis = 1
+
+
+class TestCase2(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_max'
+        self.dims = (3, 4)
+        self.dtype = 'int64'
+        self.axis = 0
+
+
+class TestCase3(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_max'
+        self.dims = (3, )
+        self.dtype = 'int64'
+        self.axis = 0
+
+
+class TestCase4(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_min'
+        self.dims = (1, )
+        self.dtype = 'int32'
+        self.axis = 0
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index bffb4f3b66..87c11e7880 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -17,7 +17,10 @@ import numpy as np
 from op_test import OpTest
 
 
-def bilinear_interp_np(input, out_h, out_w):
+def bilinear_interp_np(input, out_h, out_w, out_size):
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
     batch_size, channel, in_h, in_w = input.shape
     if out_h > 1:
         ratio_h = (in_h - 1.0) / (out_h - 1.0)
@@ -49,12 +52,15 @@ def bilinear_interp_np(input, out_h, out_w):
 
 class TestBilinearInterpOp(OpTest):
     def setUp(self):
+        self.out_size = None
         self.init_test_case()
         self.op_type = "bilinear_interp"
         input_np = np.random.random(self.input_shape).astype("float32")
-        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w)
-
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size)
         self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
         self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
         self.outputs = {'Out': output_np}
 
@@ -68,6 +74,7 @@ class TestBilinearInterpOp(OpTest):
         self.input_shape = [2, 3, 4, 4]
         self.out_h = 2
         self.out_w = 2
+        self.out_size = np.array([3, 3]).astype("int32")
 
 
 class TestCase1(TestBilinearInterpOp):
@@ -91,5 +98,29 @@ class TestCase3(TestBilinearInterpOp):
         self.out_w = 128
 
 
+class TestCase4(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestCase5(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestCase6(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 56f5af91d8..b4c48d85f2 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -19,7 +19,8 @@ import math
 from op_test import OpTest
 
 
-def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
+def box_coder(target_box, prior_box, prior_box_var, output_box, code_type,
+              box_normalized):
     prior_box_x = (
         (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
     prior_box_y = (
@@ -30,6 +31,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
         (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
     prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
                                           prior_box_var.shape[1])
+    if not box_normalized:
+        prior_box_height = prior_box_height + 1
+        prior_box_width = prior_box_width + 1
 
     if (code_type == "EncodeCenterSize"):
         target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
@@ -40,6 +44,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
             target_box.shape[0], 1)
         target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
             target_box.shape[0], 1)
+        if not box_normalized:
+            target_box_height = target_box_height + 1
+            target_box_width = target_box_width + 1
 
         output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
                 prior_box_var[:,:,0]
@@ -64,9 +71,13 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
         output_box[:, :, 1] = target_box_y - target_box_height / 2
         output_box[:, :, 2] = target_box_x + target_box_width / 2
         output_box[:, :, 3] = target_box_y + target_box_height / 2
+        if not box_normalized:
+            output_box[:, :, 2] = output_box[:, :, 2] - 1
+            output_box[:, :, 3] = output_box[:, :, 3] - 1
 
 
-def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
+def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
+                    box_normalized):
     n = target_box.shape[0]
     m = prior_box.shape[0]
     output_box = np.zeros((n, m, 4), dtype=np.float32)
@@ -74,11 +85,11 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
         if (code_type == "EncodeCenterSize"):
             box_coder(target_box[lod[i]:lod[i + 1], :], prior_box,
                       prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
-                      code_type)
+                      code_type, box_normalized)
         elif (code_type == "DecodeCenterSize"):
             box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box,
                       prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
-                      code_type)
+                      code_type, box_normalized)
     return output_box
 
 
@@ -93,15 +104,45 @@ class TestBoxCoderOp(OpTest):
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((5, 10, 4)).astype('float32')
         code_type = "DecodeCenterSize"
+        box_normalized = False
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type)
+                                     lod[0], code_type, box_normalized)
 
         self.inputs = {
             'PriorBox': prior_box,
             'PriorBoxVar': prior_box_var,
             'TargetBox': target_box,
         }
-        self.attrs = {'code_type': 'decode_center_size'}
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
+        self.outputs = {'OutputBox': output_box}
+
+
+class TestBoxCoderOpWithoutBoxVar(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 1, 2, 3, 4, 5]]
+        prior_box = np.random.random((10, 4)).astype('float32')
+        prior_box_var = np.ones((10, 4)).astype('float32')
+        target_box = np.random.random((5, 10, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized)
+
+        self.inputs = {
+            'PriorBox': prior_box,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
         self.outputs = {'OutputBox': output_box}
 
 
@@ -116,15 +157,16 @@ class TestBoxCoderOpWithLoD(OpTest):
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((20, 4)).astype('float32')
         code_type = "EncodeCenterSize"
+        box_normalized = True
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type)
+                                     lod[0], code_type, box_normalized)
 
         self.inputs = {
             'PriorBox': prior_box,
             'PriorBoxVar': prior_box_var,
             'TargetBox': (target_box, lod),
         }
-        self.attrs = {'code_type': 'encode_center_size'}
+        self.attrs = {'code_type': 'encode_center_size', 'box_normalized': True}
         self.outputs = {'OutputBox': output_box}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py
new file mode 100644
index 0000000000..e22400a045
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py
@@ -0,0 +1,75 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+import os
+import tempfile
+
+
+class TestCheckpoint(unittest.TestCase):
+    def setUp(self):
+        self.dirname = tempfile.mktemp()
+        self.max_num_checkpoints = 3
+        self.epoch_interval = 1
+        self.step_interval = 1
+        self.trainer_id = 0
+        self.chief = self.trainer_id == 0
+        self.place = fluid.CPUPlace()
+        self.epoch_id = 100
+        self.step_id = 20
+
+    def test_checkpoint(self):
+        self.save_checkpoint()
+        serial = fluid.io.get_latest_checkpoint_serial(self.dirname)
+        self.assertTrue(serial >= 0)
+        trainer_args = ["epoch_id", "step_id"]
+        epoch_id, step_id = fluid.io.load_trainer_args(
+            self.dirname, serial, self.trainer_id, trainer_args)
+        self.assertEqual(self.step_id, int(step_id))
+        self.assertEqual(self.epoch_id, int(epoch_id))
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            exe = fluid.Executor(self.place)
+            fluid.io.load_checkpoint(exe, self.dirname, serial, program)
+
+        fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
+        self.assertFalse(os.path.isdir(self.dirname))
+
+    def save_checkpoint(self):
+        config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
+                                        self.epoch_interval, self.step_interval)
+
+        trainer_args = {}
+        trainer_args["epoch_id"] = self.epoch_id
+        trainer_args["step_id"] = self.step_id
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            program.global_block().create_var(
+                name="scale_0",
+                psersistable=True,
+                dtype="float32",
+                shape=[32, 32])
+
+            exe = fluid.Executor(self.place)
+            for i in xrange(10):
+                fluid.io.save_checkpoint(exe, config.checkpoint_dir,
+                                         self.trainer_id, trainer_args, program,
+                                         config.max_num_checkpoints)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index d864b9b348..ded2f13028 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -21,8 +21,11 @@ from op_test import OpTest
 
 def conv2dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_h, in_w = input_.shape
-    f_c, out_c, f_h, f_w = filter_.shape
+    f_c, f_out_c, f_h, f_w = filter_.shape
+    groups = attrs['groups']
     assert in_c == f_c
+    out_c = f_out_c * groups
+    sub_in_c = in_c / groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
@@ -36,15 +39,21 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     for n in range(in_n):
         for i in range(in_h):
             for j in range(in_w):
-                input_masked = input_[n, :, i, j]  # (c)
-                input_masked = np.reshape(input_masked, (in_c, 1, 1))
-                input_masked = np.tile(input_masked, (1, f_h, f_w))
-
-                for k in range(out_c):
-                    tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
-                    i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
-                    j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
-                    out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out
+                for g in range(groups):
+                    input_masked = input_[n, g * sub_in_c:(g + 1) * sub_in_c, i,
+                                          j]  # (c)
+                    input_masked = np.reshape(input_masked, (sub_in_c, 1, 1))
+                    input_masked = np.tile(input_masked, (1, f_h, f_w))
+
+                    for k in range(f_out_c):
+                        tmp_out = np.sum(
+                            input_masked *
+                            filter_[g * sub_in_c:(g + 1) * sub_in_c, k, :, :],
+                            axis=0)
+                        i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
+                        j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
+                        out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
+                            dilations[1]] += tmp_out
 
     out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
     return out
@@ -64,6 +73,7 @@ class TestConv2dTransposeOp(OpTest):
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
+            'groups': self.groups,
             'dilations': self.dilations,
             'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
@@ -127,6 +137,7 @@ class TestConv2dTransposeOp(OpTest):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
@@ -140,16 +151,29 @@ class TestWithPad(TestConv2dTransposeOp):
         self.pad = [1, 1]
         self.stride = [1, 1]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
 
 
+class TestWithGroups(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3]
+
+
 class TestWithStride(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
@@ -159,6 +183,7 @@ class TestWithDilation(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
+        self.groups = 1
         self.dilations = [2, 2]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -176,6 +201,7 @@ class TestCUDNNWithPad(TestWithPad):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
+        self.groups = 1
         self.dilations = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -190,6 +216,7 @@ class TestCUDNNWithStride(TestWithStride):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
+        self.groups = 1
         self.dilations = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -200,6 +227,21 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv2d_transpose"
 
 
+class TestCUDNNWithGroups(TestWithGroups):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d_transpose"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index 55ba238710..c9f26d10df 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -21,8 +21,11 @@ from op_test import OpTest
 
 def conv3dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_d, in_h, in_w = input_.shape
-    f_c, out_c, f_d, f_h, f_w = filter_.shape
+    f_c, f_out_c, f_d, f_h, f_w = filter_.shape
+    groups = attrs['groups']
     assert in_c == f_c
+    out_c = f_out_c * groups
+    sub_in_c = in_c / groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
@@ -39,18 +42,23 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs):
         for d in range(in_d):
             for i in range(in_h):
                 for j in range(in_w):
-                    input_masked = input_[n, :, d, i, j]  # (c)
-                    input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
-                    input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
-
-                    for k in range(out_c):
-                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
-                                         axis=0)
-                        d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
-                        i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
-                        j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
-                        out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2:
-                            dilations[2]] += tmp_out
+                    for g in range(groups):
+                        input_masked = input_[n, g * sub_in_c:(g + 1
+                                                               ) * sub_in_c, d,
+                                              i, j]  # (c)
+                        input_masked = np.reshape(input_masked,
+                                                  (sub_in_c, 1, 1, 1))
+                        input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                        for k in range(f_out_c):
+                            tmp_out = np.sum(input_masked * filter_[
+                                g * sub_in_c:(g + 1) * sub_in_c, k, :, :, :],
+                                             axis=0)
+                            d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
+                            i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
+                            j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
+                            out[n, g * f_out_c + k, d1:d2:dilations[0], i1:i2:
+                                dilations[1], j1:j2:dilations[2]] += tmp_out
 
     out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
               pad[2]]
@@ -72,6 +80,7 @@ class TestConv3dTransposeOp(OpTest):
             'strides': self.stride,
             'paddings': self.pad,
             'dilations': self.dilations,
+            'groups': self.groups,
             'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
@@ -134,6 +143,7 @@ class TestConv3dTransposeOp(OpTest):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -147,16 +157,29 @@ class TestWithPad(TestConv3dTransposeOp):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
 
 
+class TestWithGroups(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3, 3]
+
+
 class TestWithStride(TestConv3dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -167,6 +190,7 @@ class TestWithDilation(TestConv3dTransposeOp):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [2, 2, 2]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -184,6 +208,7 @@ class TestCUDNNWithPad(TestWithPad):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -198,6 +223,7 @@ class TestCUDNNWithStride(TestWithStride):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -207,6 +233,21 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv3d_transpose"
 
 
+class TestCUDNNWithGroups(TestWithGroups):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d_transpose"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py
index 20cc3a643f..4016089c01 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
@@ -42,9 +42,9 @@ class TestCropOp(OpTest):
     def setUp(self):
         self.op_type = "crop"
         self.crop_by_input = False
+        self.offset_by_input = False
         self.attrs = {}
         self.initTestCase()
-        self.attrs['offsets'] = self.offsets
         if self.crop_by_input:
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
@@ -55,6 +55,10 @@ class TestCropOp(OpTest):
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
             }
+        if self.offset_by_input:
+            self.inputs['Offsets'] = np.array(self.offsets).astype('int32')
+        else:
+            self.attrs['offsets'] = self.offsets
         self.outputs = {
             'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
         }
@@ -101,5 +105,22 @@ class TestCase4(TestCropOp):
         self.crop_by_input = True
 
 
+class TestCase5(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (3, 4, 5)
+        self.crop_shape = [2, 2, 3]
+        self.offsets = [1, 0, 2]
+        self.offset_by_input = True
+
+
+class TestCase6(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (10, 9, 14)
+        self.crop_shape = [3, 3, 5]
+        self.offsets = [3, 5, 4]
+        self.crop_by_input = True
+        self.offset_by_input = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py
index 67b03f635b..870952f2f9 100644
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
@@ -15,7 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid import debuger
+from paddle.fluid import debugger
 from paddle.fluid.framework import Program
 
 
@@ -51,9 +51,9 @@ class TestDebugger(unittest.TestCase):
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
 
-        print(debuger.pprint_program_codes(p))
+        print(debugger.pprint_program_codes(p))
 
-        debuger.draw_block_graphviz(p.block(0), path="./test.dot")
+        debugger.draw_block_graphviz(p.block(0), path="./test.dot")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index a905a854ad..f545ad155c 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -160,7 +160,9 @@ class TestDetectionMAPOp(OpTest):
         label_count, true_pos, false_pos = get_input_pos(
             self.class_pos_count, self.true_pos, self.true_pos_lod,
             self.false_pos, self.false_pos_lod)
-        for (label, difficult, xmin, ymin, xmax, ymax) in self.label:
+        for v in self.label:
+            label = v[0]
+            difficult = False if len(v) == 5 else v[1]
             if self.evaluate_difficult:
                 label_count[label] += 1
             elif not difficult:
@@ -245,6 +247,15 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
                        [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]
 
 
+class TestDetectionMAPOpWithoutDiff(TestDetectionMAPOp):
+    def init_test_case(self):
+        super(TestDetectionMAPOpWithoutDiff, self).init_test_case()
+
+        # label xmin ymin xmax ymax
+        self.label = [[1, 0.1, 0.1, 0.3, 0.3], [1, 0.6, 0.6, 0.8, 0.8],
+                      [2, 0.3, 0.3, 0.6, 0.5], [1, 0.7, 0.1, 0.9, 0.3]]
+
+
 class TestDetectionMAPOp11Point(TestDetectionMAPOp):
     def init_test_case(self):
         super(TestDetectionMAPOp11Point, self).init_test_case()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index c2393a288c..2314bb2ed8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -12,19 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import time
 import unittest
+from multiprocessing import Process
+
+import numpy
 
 import paddle.fluid as fluid
-import paddle.fluid.core as core
 import paddle.fluid.layers as layers
-import numpy
-from multiprocessing import Process
-from threading import Thread
-import os, sys
-import time
 
 
 class TestSendOp(unittest.TestCase):
+    @unittest.skip(
+        "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
+    )
     def test_send(self):
         # Run init_serv in a thread
         place = fluid.CPUPlace()
@@ -34,7 +36,7 @@ class TestSendOp(unittest.TestCase):
         p.start()
 
         time.sleep(10)
-        with open("/tmp/paddle.%d.selected_port" % p.pid, "r") as fn:
+        with open("/tmp/paddle.%d.port" % p.pid, "r") as fn:
             selected_port = int(fn.readlines()[0])
         self.init_client(place, selected_port)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 10f8c4f3f0..b4379ad447 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -13,43 +13,19 @@
 # limitations under the License.
 
 import unittest
-
 import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.layers as layers
 from paddle.fluid.transpiler.distribute_transpiler import delete_ops
-import numpy
+
+from transpiler_test import TranspilerTest
 
 
-class TestDistTranspiler(unittest.TestCase):
+class TestDistTranspiler(TranspilerTest):
     def setUp(self):
-        self.trainer_id = 0
-        self.trainers = 2
-        self.pservers = 2
-        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
         self.current_pserver_ep = "127.0.0.1:6174"
 
-    def net_conf(self):
-        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
-
-        y_predict = fluid.layers.fc(input=x,
-                                    size=1000,
-                                    act=None,
-                                    param_attr=fluid.ParamAttr(name='fc_w'))
-
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-
-        optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
-        return optimize_ops, params_grads
-
     def test_transpiler(self):
         trainer = self.get_trainer()
         pserver, startup = self.get_pserver(self.current_pserver_ep)
-
         self.assertEqual([op.type for op in trainer.global_block().ops],
                          self.get_expect_trainer_ops())
 
@@ -67,18 +43,10 @@ class TestDistTranspiler(unittest.TestCase):
             "fill_constant", "fill_constant", "uniform_random", "uniform_random"
         ])
 
-        # the variable #fc_w will be split into two blocks 
+        # the variable #fc_w will be split into two blocks
         fc_w_var = startup.global_block().var("fc_w.block1")
         self.assertEqual(fc_w_var.shape, (500, 1000))
 
-    def get_main_program(self):
-        main = fluid.Program()
-
-        with fluid.program_guard(main):
-            self.net_conf()
-
-        return main
-
     def get_expect_trainer_ops(self):
         trainer = fluid.Program()
 
@@ -86,27 +54,12 @@ class TestDistTranspiler(unittest.TestCase):
             optimize_ops, params_grads = self.net_conf()
 
         delete_ops(trainer.global_block(), optimize_ops)
-        return [op.type for op in trainer.global_block().ops
-                ] + ["split_byref", "send", "concat"]
-
-    def get_trainer(self):
-        return self._transpiler_instance().get_trainer_program()
-
-    def get_pserver(self, ep):
-        t = self._transpiler_instance()
-        pserver = t.get_pserver_program(ep)
-        startup = t.get_startup_program(ep, pserver)
-        return pserver, startup
-
-    def _transpiler_instance(self):
-        main = self.get_main_program()
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            self.trainer_id,
-            program=main,
-            pservers=self.pserver_eps,
-            trainers=self.trainers)
-        return t
+        ops = [op.type for op in trainer.global_block().ops] + [
+            "split_byref", "send", "send_barrier", "recv", "recv",
+            "fetch_barrier", "concat"
+        ]
+        ops.insert(ops.index("elementwise_add_grad") + 1, "send")
+        return ops
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 2232939075..95af51f1b2 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -30,9 +30,6 @@ class Memory(object):
         assert val.dtype == self.ex.dtype
         self.cur = val
 
-    def ex(self):
-        return self.ex
-
     def next(self):
         self.ex = self.cur
         self.cur = None
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 1f52bd90d0..96d47906a0 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
         self.axis = 1
 
 
+class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 10, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
new file mode 100644
index 0000000000..281068e945
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+
+
+def quantize_max_abs(x, num_bits):
+    range = math.pow(2, num_bits) - 1
+    scale = np.max(np.abs(x).flatten())
+    y = np.round(x / scale * range)
+    return y, scale
+
+
+def dequantize_max_abs(x, num_bits, scale):
+    range = math.pow(2, num_bits) - 1
+    y = (scale / range) * x
+    return y
+
+
+class TestFakeDequantizeMaxAbsOp(OpTest):
+    def set_args(self):
+        self.num_bits = 8
+
+    def setUp(self):
+        self.set_args()
+        self.op_type = "fake_dequantize_max_abs"
+        x = np.random.randn(31, 65).astype("float32")
+        yq, scale = quantize_max_abs(x, self.num_bits)
+        print 'scale ', scale
+        ydq = dequantize_max_abs(yq, self.num_bits, scale)
+
+        self.inputs = {'X': yq}
+        self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)}
+        self.outputs = {'Out': ydq}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFakeDequantizeMaxAbsOp5Bits(OpTest):
+    def set_args(self):
+        self.num_bits = 5
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
index 66e3e2d51d..533d8ccfac 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
@@ -50,5 +50,27 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
         self.check_output()
 
 
+class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
+    def setUp(self):
+        self.op_type = "fill_constant_batch_size_like"
+        self.inputs = {
+            'Input': (np.random.random((31, 28)).astype("float32"),
+                      [[0, 9, 23, 31]])
+        }
+        self.attrs = {
+            'value': 3.5,
+            'shape': [-1, 16],
+            'input_dim_idx': 0,
+            'output_dim_idx': 0
+        }
+
+        out = np.random.random((3, 16)).astype("float32")
+        out.fill(3.5)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 6fd043c27e..4ae9086480 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -20,8 +20,9 @@ from op_test import OpTest
 class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
-        xnp = np.random.random((10, 20)).astype("float32")
-        self.inputs = {'X': xnp, 'Index': np.array([1, 3, 5]).astype("int32")}
+        self.config()
+        xnp = np.random.random(self.x_shape).astype("float32")
+        self.inputs = {'X': xnp, 'Index': np.array(self.index).astype("int32")}
         self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
 
     def test_check_output(self):
@@ -30,6 +31,16 @@ class TestGatherOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
+    def config(self):
+        self.x_shape = (10, 20)
+        self.index = [1, 3, 5]
+
+
+class TestCase1(TestGatherOp):
+    def config(self):
+        self.x_shape = (10)
+        self.index = [1, 3, 5]
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index c5414abf0f..f8cf6f4e2d 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -369,6 +369,38 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
+    def test_resize_bilinear(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
+            output = layers.resize_bilinear(x, out_shape=[12, 12])
+            self.assertIsNotNone(output)
+            output = layers.resize_bilinear(x, scale=3)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_polygon_box_transform(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8, 4, 4], dtype="float32")
+            output = layers.polygon_box_transform(input=x)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_l2_normalize(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8, 7, 10], dtype="float32")
+            output = layers.l2_normalize(x, axis=1)
+
+    def test_maxout(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='x', shape=[8, 6, 6], dtype="float32")
+            output = layers.maxout(x=data, groups=2)
+            self.assertIsNotNone(output)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
new file mode 100644
index 0000000000..d1d709551c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -0,0 +1,107 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import os
+import signal
+import subprocess
+import time
+import unittest
+from multiprocessing import Process
+from op_test import OpTest
+
+
+def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
+    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    # loss function
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    # optimizer
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    pserver_endpoints = ip + ":" + port
+    current_endpoint = ip + ":" + port
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=sync_mode)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+
+
+class TestListenAndServOp(OpTest):
+    def setUp(self):
+        self.ps_timeout = 5
+        self.ip = "127.0.0.1"
+        self.port = "6173"
+        self.trainers = 1
+        self.trainer_id = 1
+
+    def _start_pserver(self, use_cuda, sync_mode):
+        p = Process(
+            target=run_pserver,
+            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
+                  self.trainer_id))
+        p.start()
+        return p.pid
+
+    def _wait_ps_ready(self, pid):
+        start_left_time = self.ps_timeout
+        sleep_time = 0.5
+        while True:
+            assert start_left_time >= 0, "wait ps ready failed"
+            time.sleep(sleep_time)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                start_left_time -= sleep_time
+
+    def test_rpc_interfaces(self):
+        # TODO(Yancey1989): need to make sure the rpc interface correctly.
+        pass
+
+    def test_handle_signal_in_serv_op(self):
+        # run pserver on CPU in sync mode
+        pid = self._start_pserver(False, True)
+        self._wait_ps_ready(pid)
+
+        # raise SIGTERM to pserver
+        os.kill(pid, signal.SIGTERM)
+
+        # run pserver on CPU in async mode
+        pid = self._start_pserver(False, False)
+        self._wait_ps_ready(pid)
+
+        # raise SIGTERM to pserver
+        os.kill(pid, signal.SIGTERM)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index f8ff5a3361..e726f99d49 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -194,107 +194,104 @@ class TestLstmOp(OpTest):
             ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
 
 
-class TestLstmOpHasInitial(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = True
-        self.is_reverse = True
-        self.use_peepholes = True
-
-    def test_check_grad(self):
-        # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
-            max_relative_error=5e-4)
-
-    def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Bias'))
-
-    def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Weight'))
-
-    def test_check_grad_ingore_input(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Weight', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Input'))
-
-    def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('H0'))
-
-    def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('C0'))
-
-
-class TestLstmOpRerverse(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = True
-
-
-class TestLstmOpNotUsePeepholes(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = False
-
+# class TestLstmOpHasInitial(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = True
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+#     def test_check_grad(self):
+#         # TODO(qingqing) remove folowing lines after the check_grad is refined.
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4)
+
+#     def test_check_grad_ingore_bias(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Bias'))
+
+#     def test_check_grad_ingore_weight(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Weight'))
+
+#     def test_check_grad_ingore_input(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Weight', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Input'))
+
+#     def test_check_grad_ingore_h0(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('H0'))
+
+#     def test_check_grad_ingore_c0(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('C0'))
+
+# class TestLstmOpRerverse(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+# class TestLstmOpNotUsePeepholes(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = False
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
deleted file mode 100644
index 42d68ef376..0000000000
--- a/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
-
-
-class TestMKLDNNMulOp(TestMulOp):
-    def init_op_test(self):
-        super(TestMKLDNNMulOp, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNMulOp2(TestMulOp2):
-    def init_op_test(self):
-        super(TestMKLDNNMulOp2, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNFP16MulOp1(TestFP16MulOp1):
-    def init_op_test(self):
-        super(TestMKLDNNFP16MulOp1, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNFP16MulOp2(TestFP16MulOp2):
-    def init_op_test(self):
-        super(TestMKLDNNFP16MulOp2, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index d984393c89..bbc782c1bc 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -21,12 +21,10 @@ from op_test import OpTest
 class TestMulOp(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
         self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
+            'X': np.random.random((2, 5)).astype("float32"),
+            'Y': np.random.random((5, 3)).astype("float32")
         }
-        self.attrs = {'use_mkldnn': self.use_mkldnn}
         self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
 
     def test_check_output(self):
@@ -47,19 +45,17 @@ class TestMulOp(OpTest):
 class TestMulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
         self.inputs = {
-            'X': np.random.random((15, 4, 12, 10)).astype("float32"),
-            'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
+            'X': np.random.random((3, 4, 4, 3)).astype("float32"),
+            'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32")
         }
         self.attrs = {
             'x_num_col_dims': 2,
             'y_num_col_dims': 2,
-            'use_mkldnn': self.use_mkldnn
         }
-        result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
-                        self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
-        result = result.reshape(15, 4, 8, 2, 9)
+        result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3),
+                        self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
         self.outputs = {'Out': result}
 
     def test_check_output(self):
@@ -80,11 +76,9 @@ class TestMulOp2(OpTest):
 class TestFP16MulOp1(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
-        x = np.random.random((32, 84)).astype("float16")
-        y = np.random.random((84, 100)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
-        self.attrs = {'use_mkldnn': self.use_mkldnn}
+        x = np.random.random((3, 5)).astype("float16")
+        y = np.random.random((5, 4)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
         self.outputs = {'Out': np.dot(x, y)}
 
     def test_check_output(self):
@@ -97,18 +91,15 @@ class TestFP16MulOp1(OpTest):
 class TestFP16MulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
-        x = np.random.random((15, 4, 12, 10)).astype("float16")
-        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        x = np.random.random((3, 4, 4, 3)).astype("float16")
+        y = np.random.random((2, 6, 1, 2, 3)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
         self.attrs = {
             'x_num_col_dims': 2,
             'y_num_col_dims': 2,
-            'use_mkldnn': self.use_mkldnn
         }
-        result = np.dot(
-            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
-        result = result.reshape(15, 4, 8, 2, 9)
+        result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
         self.outputs = {'Out': result}
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py
index 6feda175fb..108a665f37 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_op.py
@@ -17,44 +17,23 @@ import numpy as np
 from op_test import OpTest
 
 
-def norm(input, scale, epsilon):
-    s0, s1, s2, s3 = input.shape
-    x_square = input * input
-    for i in xrange(s0):
-        input_batch = input[i:i + 1, :, :, :]
-        input_batch = input_batch.reshape(s1, s2 * s3)
-        x_square_batch = x_square[i:i + 1, :, :, :]
-        x_square_batch = x_square_batch.reshape(s1, s2 * s3)
-        square_colsum = x_square_batch.sum(axis=0) + epsilon
-        tmp = pow(square_colsum, 0.5)
-        tmp = np.reciprocal(tmp)
-        tmp_tile = np.tile(tmp, s1)
-        tmp_tile = tmp_tile.reshape(s1, s2 * s3)
-        scale_tile = np.tile(scale, (1, s2 * s3))
-        scale_tile = scale_tile.reshape(s1, s2 * s3)
-        out_batch = input_batch * tmp_tile * scale_tile
-        out_batch = out_batch.reshape(1, s1, s2, s3)
-        if i == 0:
-            out = out_batch
-        else:
-            out = np.concatenate((out, out_batch), 0)
-    out.reshape(s0, s1, s2, s3)
-    return out
+def l2_norm(x, axis, epsilon):
+    x2 = x**2
+    s = np.sum(x2, axis=axis, keepdims=True)
+    r = np.sqrt(s + epsilon)
+    y = x / np.broadcast_to(r, x.shape)
+    return y, r
 
 
 class TestNormOp(OpTest):
     def setUp(self):
         self.op_type = "norm"
         self.init_test_case()
-        input = np.random.random(self.shape).astype("float32")
-        scale = np.array([10, 10, 10])
-        self.inputs = {
-            'X': input.astype('float32'),
-            'Scale': scale.astype('float32')
-        }
-        self.attrs = {'epsilon': self.epsilon}
-        output = norm(input, scale, self.epsilon)
-        self.outputs = {'Out': output.astype('float32')}
+        x = np.random.random(self.shape).astype("float64")
+        y, norm = l2_norm(x, self.axis, self.epsilon)
+        self.inputs = {'X': x}
+        self.attrs = {'epsilon': self.epsilon, 'axis': self.axis}
+        self.outputs = {'Out': y, 'Norm': norm}
 
     def test_check_output(self):
         self.check_output()
@@ -63,8 +42,23 @@ class TestNormOp(OpTest):
         self.check_grad(['X'], 'Out')
 
     def init_test_case(self):
-        self.shape = [2, 3, 2, 2]
-        self.epsilon = 1e-6
+        self.shape = [2, 3, 4, 4]
+        self.axis = 1
+        self.epsilon = 1e-8
+
+
+class TestNormOp2(TestNormOp):
+    def init_test_case(self):
+        self.shape = [5, 3, 9, 7]
+        self.axis = 0
+        self.epsilon = 1e-8
+
+
+class TestNormOp3(TestNormOp):
+    def init_test_case(self):
+        self.shape = [5, 3, 2, 7]
+        self.axis = -1
+        self.epsilon = 1e-8
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
index ef34893943..198c68866d 100644
--- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
+++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
@@ -70,8 +70,9 @@ class TestNormalization(unittest.TestCase):
     def l2_normalize(self, data, axis, epsilon):
         """ Compute the groundtruth.
         """
-        output = data * np.reciprocal(
-            np.sum(np.square(data), axis=axis, keepdims=True))
+        output = data / np.broadcast_to(
+            np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True)),
+            data.shape)
         return output
 
     def test_l2_normalize(self):
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 779ae388f0..c098a5a0cb 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -63,7 +63,7 @@ class TestOperator(unittest.TestCase):
         self.assertEqual(mul_op.output("Out"), ["mul.out"])
         self.assertEqual(
             set(mul_op.attr_names),
-            set(["x_num_col_dims", "y_num_col_dims", "use_mkldnn"]))
+            set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
         self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
deleted file mode 100644
index 056f9e1781..0000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ /dev/null
@@ -1,902 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import unittest
-
-import paddle.fluid as fluid
-import paddle
-import paddle.dataset.mnist as mnist
-import paddle.dataset.wmt16 as wmt16
-
-
-def simple_fc_net(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=['./mnist.recordio'],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-    hidden = img
-    for _ in xrange(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=['mnist.recordio'],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-
-    hidden = img
-    for _ in xrange(1):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def squeeze_excitation(input, num_channels, reduction_ratio):
-    # pool = fluid.layers.pool2d(
-    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
-    conv = input
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-
-    squeeze = fluid.layers.fc(input=pool,
-                              size=num_channels / reduction_ratio,
-                              act='relu')
-    excitation = fluid.layers.fc(input=squeeze,
-                                 size=num_channels,
-                                 act='sigmoid')
-    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
-    return scale
-
-
-def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
-                  act=None):
-    conv = fluid.layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=stride,
-        padding=(filter_size - 1) / 2,
-        groups=groups,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1]
-    if ch_in != ch_out:
-        if stride == 1:
-            filter_size = 1
-        else:
-            filter_size = 3
-        return conv_bn_layer(input, ch_out, filter_size, stride)
-    else:
-        return input
-
-
-def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
-    # The number of first 1x1 convolutional channels for each bottleneck build block
-    # was halved to reduce the compution cost.
-    conv0 = conv_bn_layer(
-        input=input, num_filters=num_filters, filter_size=1, act='relu')
-    conv1 = conv_bn_layer(
-        input=conv0,
-        num_filters=num_filters * 2,
-        filter_size=3,
-        stride=stride,
-        groups=cardinality,
-        act='relu')
-    conv2 = conv_bn_layer(
-        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-    scale = squeeze_excitation(
-        input=conv2,
-        num_channels=num_filters * 2,
-        reduction_ratio=reduction_ratio)
-
-    short = shortcut(input, num_filters * 2, stride)
-
-    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-
-
-def SE_ResNeXt50Small(batch_size=2, use_feed=False):
-    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
-
-    img = fluid.layers.fill_constant(
-        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
-    label = fluid.layers.fill_constant(
-        shape=[batch_size, 1], dtype='int64', value=0.0)
-
-    conv = conv_bn_layer(
-        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = fluid.layers.pool2d(
-        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-    cardinality = 32
-    reduction_ratio = 16
-    depth = [3, 4, 6, 3]
-    num_filters = [128, 256, 512, 1024]
-
-    for block in range(len(depth)):
-        for i in range(depth[block]):
-            conv = bottleneck_block(
-                input=conv,
-                num_filters=num_filters[block],
-                stride=2 if i == 0 and block != 0 else 1,
-                cardinality=cardinality,
-                reduction_ratio=reduction_ratio)
-
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
-    # Classifier layer:
-    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-import time
-
-
-class TestParallelExecutorBase(unittest.TestCase):
-    def check_network_convergence(self,
-                                  method,
-                                  memory_opt=True,
-                                  iter=50,
-                                  batch_size=None,
-                                  allow_op_delay=False,
-                                  feed_dict=None,
-                                  seed=None,
-                                  use_parallel_executor=True,
-                                  balance_parameter_opt_between_cards=False):
-        def run_executor(exe, feed, fetch_list, program=None):
-            if isinstance(exe, fluid.ParallelExecutor):
-                res = exe.run(fetch_list=fetch_list, feed=feed)
-            elif isinstance(exe, fluid.Executor):
-                if program is None:
-                    program = fluid.default_main_program()
-                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
-            else:
-                raise ValueError('Unkown type exe')
-            return res
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        startup.random_seed = 1  # Fix random seed
-        with fluid.program_guard(main, startup):
-            if seed is not None:
-                startup.random_seed = seed
-            loss = method(use_feed=feed_dict is not None)
-            adam = fluid.optimizer.Adam()
-            adam.minimize(loss)
-            if memory_opt:
-                fluid.memory_optimize(main)
-            place = fluid.CUDAPlace(0)
-            startup_exe = fluid.Executor(place)
-            startup_exe.run(startup)
-            exec_strategy = fluid.ExecutionStrategy()
-            exec_strategy.allow_op_delay = allow_op_delay
-
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
-
-            if use_parallel_executor:
-                exe = fluid.ParallelExecutor(
-                    True,
-                    loss_name=loss.name,
-                    exec_strategy=exec_strategy,
-                    build_strategy=build_strategy)
-            else:
-                exe = fluid.Executor(place=place)
-
-            if batch_size is not None:
-                batch_size *= fluid.core.get_cuda_device_count()
-            begin = time.time()
-            first_loss, = run_executor(
-                exe=exe, feed=feed_dict, fetch_list=[loss.name])
-            first_loss = np.array(first_loss)
-
-            for i in xrange(iter):
-                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
-
-            last_loss, = run_executor(
-                exe=exe, feed=feed_dict, fetch_list=[loss.name])
-            end = time.time()
-
-            if batch_size is not None:
-                print "%.4f Instance per second" % (
-                    (batch_size * iter + 2) / (end - begin))
-
-            last_loss = np.array(last_loss)
-
-            print first_loss, last_loss
-            # self.assertGreater(first_loss[0], last_loss[0])
-            return first_loss, last_loss
-
-
-class TestMNIST(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        # Convert mnist to recordio file
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            reader = paddle.batch(mnist.train(), batch_size=4)
-            feeder = fluid.DataFeeder(
-                feed_list=[  # order is image and label
-                    fluid.layers.data(
-                        name='image', shape=[784]),
-                    fluid.layers.data(
-                        name='label', shape=[1], dtype='int64'),
-                ],
-                place=fluid.CPUPlace())
-            fluid.recordio_writer.convert_reader_to_recordio_file(
-                './mnist.recordio', reader, feeder)
-
-    def check_simple_fc_convergence(self, balance_parameter_opt_between_cards):
-        self.check_network_convergence(simple_fc_net)
-        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
-
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        self.check_network_convergence(
-            simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
-
-    def test_simple_fc(self):
-        self.check_simple_fc_convergence(False)
-
-    def test_simple_fc_with_new_strategy(self):
-        self.check_simple_fc_convergence(True)
-
-    def check_simple_fc_parallel_accuracy(self,
-                                          balance_parameter_opt_between_cards):
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        single_first_loss, single_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            seed=1000,
-            feed_dict={"image": img,
-                       "label": label},
-            use_parallel_executor=False)
-        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            seed=1000,
-            feed_dict={"image": img,
-                       "label": label},
-            use_parallel_executor=True,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
-
-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
-
-    def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(False)
-
-    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
-        self.check_simple_fc_parallel_accuracy(True)
-
-    def check_batchnorm_fc_convergence(self,
-                                       balance_parameter_opt_between_cards):
-        self.check_network_convergence(fc_with_batchnorm)
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        self.check_network_convergence(
-            fc_with_batchnorm,
-            feed_dict={"image": img,
-                       "label": label},
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
-
-    def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(False)
-
-    def test_batchnorm_fc_with_new_strategy(self):
-        self.check_batchnorm_fc_convergence(True)
-
-
-class TestResnet(TestParallelExecutorBase):
-    # @classmethod
-    # def setUpClass(cls):
-    #     # import os
-    #     # if os.path.exists('./flowers.recordio'):
-    #     #     return
-    #     with fluid.program_guard(fluid.Program(), fluid.Program()):
-    #         reader = paddle.batch(flowers.train(), batch_size=4)
-    #         feeder = fluid.DataFeeder(
-    #             feed_list=[
-    #                 fluid.layers.data(
-    #                     name='image', shape=[3, 224, 224]),
-    #                 fluid.layers.data(
-    #                     name='label', shape=[1], dtype='int64'),
-    #             ],
-    #             place=fluid.CPUPlace())
-    #         fluid.recordio_writer.convert_reader_to_recordio_file(
-    #             "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
-
-    def check_resnet_convergence(self, balance_parameter_opt_between_cards):
-        import functools
-        batch_size = 2
-        self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
-            iter=20,
-            batch_size=batch_size,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
-
-    def test_resnet(self):
-        self.check_resnet_convergence(False)
-
-    def test_resnet_with_new_strategy(self):
-        self.check_resnet_convergence(True)
-
-
-class ModelHyperParams(object):
-    # Dictionary size for source and target language. This model directly uses
-    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
-    # alreay been added, but the <pad> token is not added. Transformer requires
-    # sequences in a mini-batch are padded to have the same length. A <pad> token is
-    # added into the original dictionary in paddle.dateset.wmt16.
-
-    # size of source word dictionary.
-    src_vocab_size = 10000
-    # index for <pad> token in source language.
-    src_pad_idx = src_vocab_size
-
-    # size of target word dictionay
-    trg_vocab_size = 10000
-    # index for <pad> token in target language.
-    trg_pad_idx = trg_vocab_size
-
-    # position value corresponding to the <pad> token.
-    pos_pad_idx = 0
-
-    # max length of sequences. It should plus 1 to include position
-    # padding token for position encoding.
-    max_length = 50
-
-    # the dimension for word embeddings, which is also the last dimension of
-    # the input and output of multi-head attention, position-wise feed-forward
-    # networks, encoder and decoder.
-
-    d_model = 512
-    # size of the hidden layer in position-wise feed-forward networks.
-    d_inner_hid = 1024
-    # the dimension that keys are projected to for dot-product attention.
-    d_key = 64
-    # the dimension that values are projected to for dot-product attention.
-    d_value = 64
-    # number of head used in multi-head attention.
-    n_head = 8
-    # number of sub-layers to be stacked in the encoder and decoder.
-    n_layer = 6
-    # dropout rate used by all dropout layers.
-    dropout = 0.1
-
-
-def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and attention bias. Then, convert the numpy
-    data to tensors and return a dict mapping names to tensors.
-    """
-
-    def __pad_batch_data(insts,
-                         pad_idx,
-                         is_target=False,
-                         return_pos=True,
-                         return_attn_bias=True,
-                         return_max_len=True):
-        """
-        Pad the instances to the max sequence length in batch, and generate the
-        corresponding position data and attention bias.
-        """
-        return_list = []
-        max_len = max(len(inst) for inst in insts)
-        inst_data = np.array(
-            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
-        return_list += [inst_data.astype("int64").reshape([-1, 1])]
-        if return_pos:
-            inst_pos = np.array([[
-                pos_i + 1 if w_i != pad_idx else 0
-                for pos_i, w_i in enumerate(inst)
-            ] for inst in inst_data])
-
-            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
-        if return_attn_bias:
-            if is_target:
-                # This is used to avoid attention on paddings and subsequent
-                # words.
-                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
-                                              max_len))
-                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
-                    [-1, 1, max_len, max_len])
-                slf_attn_bias_data = np.tile(slf_attn_bias_data,
-                                             [1, n_head, 1, 1]) * [-1e9]
-            else:
-                # This is used to avoid attention on paddings.
-                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
-                                               (max_len - len(inst))
-                                               for inst in insts])
-                slf_attn_bias_data = np.tile(
-                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
-                    [1, n_head, max_len, 1])
-            return_list += [slf_attn_bias_data.astype("float32")]
-        if return_max_len:
-            return_list += [max_len]
-        return return_list if len(return_list) > 1 else return_list[0]
-
-    def data_to_tensor(data_list, name_list, input_dict, place):
-        assert len(data_list) == len(name_list)
-        for i in range(len(name_list)):
-            tensor = fluid.LoDTensor()
-            tensor.set(data_list[i], place)
-            input_dict[name_list[i]] = tensor
-
-    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
-        [inst[0] for inst in insts], src_pad_idx, is_target=False)
-    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
-        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
-    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
-                                [1, 1, trg_max_len, 1]).astype("float32")
-    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
-                                False, False, False)
-    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
-
-    return [
-        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
-        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
-    ]
-
-
-import transformer_model
-
-
-def transformer(use_feed):
-    assert not use_feed, "transfomer doesn't support feed yet"
-    return transformer_model.transformer(
-        ModelHyperParams.src_vocab_size + 1,
-        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
-        ModelHyperParams.n_layer, ModelHyperParams.n_head,
-        ModelHyperParams.d_key, ModelHyperParams.d_value,
-        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
-        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
-        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
-
-
-class TestTransformer(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        reader = paddle.batch(
-            wmt16.train(ModelHyperParams.src_vocab_size,
-                        ModelHyperParams.trg_vocab_size),
-            batch_size=transformer_model.batch_size)
-
-        with fluid.recordio_writer.create_recordio_writer(
-                "./wmt16.recordio") as writer:
-            for batch in reader():
-                for tensor in prepare_batch_input(
-                        batch, ModelHyperParams.src_pad_idx,
-                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
-                    t = fluid.LoDTensor()
-                    t.set(tensor, fluid.CPUPlace())
-                    writer.append_tensor(t)
-                writer.complete_append_tensor()
-
-    @unittest.skip("transformer is buggy in multi gpu")
-    def test_main(self):
-        self.check_network_convergence(transformer)
-
-
-class ParallelExecutorTestingDuringTraining(unittest.TestCase):
-    def check_network_convergence(self, build_strategy=None):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net(True)
-            test_program = main.clone(for_test=True)
-
-            opt = fluid.optimizer.SGD(learning_rate=0.001)
-            opt.minimize(loss)
-
-            batch_size = 32
-            image = np.random.normal(size=(batch_size, 784)).astype('float32')
-            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(startup)
-            feed_dict = {'image': image, 'label': label}
-
-            train_exe = fluid.ParallelExecutor(
-                use_cuda=True,
-                loss_name=loss.name,
-                main_program=main,
-                build_strategy=build_strategy)
-
-            test_exe = fluid.ParallelExecutor(
-                use_cuda=True,
-                main_program=test_program,
-                share_vars_from=train_exe,
-                build_strategy=build_strategy)
-
-            for i in xrange(5):
-                test_loss, = test_exe.run([loss.name], feed=feed_dict)
-                test_loss = np.array(test_loss)
-
-                train_loss, = train_exe.run([loss.name], feed=feed_dict)
-                train_loss = np.array(train_loss)
-                self.assertTrue(
-                    np.allclose(
-                        train_loss, test_loss, atol=1e-8),
-                    "Train loss: " + str(train_loss) + "\n Test loss:" +
-                    str(test_loss))
-
-    def test_parallel_testing(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(build_strategy)
-
-    def test_parallel_testing_with_new_strategy(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(build_strategy)
-
-
-import paddle.dataset.conll05 as conll05
-import paddle.fluid as fluid
-
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_dict_len = len(verb_dict)
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-mix_hidden_lr = 1e-3
-embedding_name = 'emb'
-
-
-def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
-            is_sparse, **ignored):
-    # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        is_sparse=is_sparse,
-        size=[pred_dict_len, word_dim],
-        dtype='float32',
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        is_sparse=is_sparse,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32')
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        fluid.layers.embedding(
-            size=[word_dict_len, word_dim],
-            is_sparse=is_sparse,
-            input=x,
-            param_attr=fluid.ParamAttr(
-                name=embedding_name, trainable=False)) for x in word_input
-    ]
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
-        for emb in emb_layers
-    ]
-
-    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=hidden_dim,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    # stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, depth):
-        mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
-        ])
-
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=hidden_dim,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
-    ])
-
-    return feature_out
-
-
-class TestCRFModel(unittest.TestCase):
-    def check_network_convergence(self, is_sparse, build_strategy=None):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            word = fluid.layers.data(
-                name='word_data', shape=[1], dtype='int64', lod_level=1)
-            predicate = fluid.layers.data(
-                name='verb_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n2 = fluid.layers.data(
-                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n1 = fluid.layers.data(
-                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_0 = fluid.layers.data(
-                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p1 = fluid.layers.data(
-                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p2 = fluid.layers.data(
-                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-            mark = fluid.layers.data(
-                name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-            feature_out = db_lstm(**locals())
-            target = fluid.layers.data(
-                name='target', shape=[1], dtype='int64', lod_level=1)
-            crf_cost = fluid.layers.linear_chain_crf(
-                input=feature_out,
-                label=target,
-                param_attr=fluid.ParamAttr(
-                    name='crfw', learning_rate=1e-1))
-            avg_cost = fluid.layers.mean(crf_cost)
-
-            sgd_optimizer = fluid.optimizer.SGD(
-                learning_rate=fluid.layers.exponential_decay(
-                    learning_rate=0.01,
-                    decay_steps=100000,
-                    decay_rate=0.5,
-                    staircase=True))
-            sgd_optimizer.minimize(avg_cost)
-
-            train_data = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.conll05.test(), buf_size=8192),
-                batch_size=16)
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(startup)
-
-            pe = fluid.ParallelExecutor(
-                use_cuda=True,
-                loss_name=avg_cost.name,
-                build_strategy=build_strategy)
-
-            feeder = fluid.DataFeeder(
-                feed_list=[
-                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
-                    mark, target
-                ],
-                place=fluid.CPUPlace())
-
-            data = train_data()
-            for i in xrange(10):
-                cur_batch = next(data)
-                print map(np.array,
-                          pe.run(feed=feeder.feed(cur_batch),
-                                 fetch_list=[avg_cost.name]))[0]
-
-    def test_update_sparse_parameter_all_reduce(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(
-            is_sparse=True, build_strategy=build_strategy)
-
-    def test_update_dense_parameter_all_reduce(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(
-            is_sparse=False, build_strategy=build_strategy)
-
-    def test_update_sparse_parameter_reduce(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            is_sparse=True, build_strategy=build_strategy)
-
-    def test_update_dense_parameter_reduce(self):
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            is_sparse=False, build_strategy=build_strategy)
-
-
-# test fetch all the variables of global_block
-
-import paddle.dataset.flowers as flowers
-import math
-
-
-def Lenet(data, class_dim):
-    conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
-    bn1 = fluid.layers.batch_norm(conv1, act='relu')
-    pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
-    conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
-    bn2 = fluid.layers.batch_norm(conv2, act='relu')
-    pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
-
-    fc1 = fluid.layers.fc(pool2, size=500, act='relu')
-    fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
-
-    return fc2
-
-
-class TestFetchOp(unittest.TestCase):
-    def parallel_exe(self, train_inputs, seed):
-        main = fluid.Program()
-        startup = fluid.Program()
-        startup.random_seed = seed
-        with fluid.program_guard(main, startup):
-            data = fluid.layers.data(
-                name='image', shape=[3, 224, 224], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            out = Lenet(data, class_dim=102)
-            loss = fluid.layers.cross_entropy(input=out, label=label)
-            loss = fluid.layers.mean(loss)
-
-            opt = fluid.optimizer.Momentum(
-                learning_rate=0.1,
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-
-            opt.minimize(loss)
-
-            # TODO(zcd): I found that onece the memory optimizer is open,
-            # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
-            # conv2d_1.b_0@GRAD. Those variables should not be pruned.
-            # fluid.memory_optimize(main)
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(startup)
-
-            feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
-            pe = fluid.ParallelExecutor(
-                use_cuda=True, loss_name=loss.name, main_program=main)
-
-            fetch_list = []
-            all_vars = main.global_block().vars
-            for k, v in all_vars.iteritems():
-                if 'tmp' not in k and k[0] is not '_' or v.persistable:
-                    fetch_list.append(k)
-
-            for data in train_inputs:
-                ret = pe.run(fetch_list, feed=feeder.feed(data))
-                for i in range(len(fetch_list)):
-                    assert not math.isnan(np.sum(ret[i])) and \
-                           not math.isinf(np.sum(ret[i]))
-
-    def test_fetch_op(self):
-        tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
-        tst_reader_iter = tst_reader()
-
-        iters = 3
-        train_inputs = []
-        for i in range(iters):
-            train_inputs.append(tst_reader_iter.next())
-
-        self.parallel_exe(train_inputs, seed=1)
-
-
-class TestFeedParallel(unittest.TestCase):
-    def test_main(self):
-        main = fluid.Program()
-        startup = fluid.Program()
-        startup.random_seed = 1
-        with fluid.scope_guard(fluid.core.Scope()):
-            with fluid.program_guard(main, startup):
-                data = fluid.layers.data(
-                    name='image', shape=[3, 224, 224], dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
-                out = Lenet(data, class_dim=102)
-                loss = fluid.layers.cross_entropy(input=out, label=label)
-                loss = fluid.layers.mean(loss)
-                opt = fluid.optimizer.Momentum(
-                    learning_rate=0.1,
-                    momentum=0.9,
-                    regularization=fluid.regularizer.L2Decay(1e-4))
-
-                opt.minimize(loss)
-        place = fluid.CUDAPlace(0)
-        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
-        reader = feeder.decorate_reader(
-            paddle.batch(
-                flowers.train(), batch_size=16), multi_devices=True)
-        exe = fluid.Executor(place)
-        exe.run(startup)
-        pe = fluid.ParallelExecutor(
-            use_cuda=True, loss_name=loss.name, main_program=main)
-
-        for batch_id, data in enumerate(reader()):
-            loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0])
-            print batch_id, loss_np
-            if batch_id == 2:
-                break
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
new file mode 100644
index 0000000000..163975555e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.conll05 as conll05
+import paddle.fluid as fluid
+import unittest
+import paddle
+import numpy as np
+import os
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_dict_len = len(verb_dict)
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+embedding_name = 'emb'
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            is_sparse, **ignored):
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        is_sparse=is_sparse,
+        size=[pred_dict_len, word_dim],
+        dtype='float32',
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        is_sparse=is_sparse,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32')
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            is_sparse=is_sparse,
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
+    ])
+
+    return feature_out
+
+
+class TestCRFModel(unittest.TestCase):
+    def check_network_convergence(self,
+                                  is_sparse,
+                                  build_strategy=None,
+                                  use_cuda=True):
+        os.environ['CPU_NUM'] = str(4)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            word = fluid.layers.data(
+                name='word_data', shape=[1], dtype='int64', lod_level=1)
+            predicate = fluid.layers.data(
+                name='verb_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n2 = fluid.layers.data(
+                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n1 = fluid.layers.data(
+                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_0 = fluid.layers.data(
+                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p1 = fluid.layers.data(
+                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p2 = fluid.layers.data(
+                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+            mark = fluid.layers.data(
+                name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+            feature_out = db_lstm(**locals())
+            target = fluid.layers.data(
+                name='target', shape=[1], dtype='int64', lod_level=1)
+            crf_cost = fluid.layers.linear_chain_crf(
+                input=feature_out,
+                label=target,
+                param_attr=fluid.ParamAttr(
+                    name='crfw', learning_rate=1e-1))
+            avg_cost = fluid.layers.mean(crf_cost)
+
+            sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=0.01,
+                    decay_steps=100000,
+                    decay_rate=0.5,
+                    staircase=True))
+            sgd_optimizer.minimize(avg_cost)
+
+            train_data = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.conll05.test(), buf_size=8192),
+                batch_size=16)
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup)
+
+            pe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                loss_name=avg_cost.name,
+                build_strategy=build_strategy)
+
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
+                    mark, target
+                ],
+                place=fluid.CPUPlace())
+
+            data = train_data()
+            for i in xrange(10):
+                cur_batch = next(data)
+                print map(np.array,
+                          pe.run(feed=feeder.feed(cur_batch),
+                                 fetch_list=[avg_cost.name]))[0]
+
+    def test_update_sparse_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=False)
+
+    def test_update_dense_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=False)
+
+    def test_update_sparse_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=False)
+
+    def test_update_dense_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
new file mode 100644
index 0000000000..79702475cc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.flowers as flowers
+import math
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+import paddle
+import os
+
+
+def Lenet(data, class_dim):
+    conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
+    bn1 = fluid.layers.batch_norm(conv1, act='relu')
+    pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
+    conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
+    bn2 = fluid.layers.batch_norm(conv2, act='relu')
+    pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
+
+    fc1 = fluid.layers.fc(pool2, size=500, act='relu')
+    fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
+
+    return fc2
+
+
+class TestFetchOp(unittest.TestCase):
+    def parallel_exe(self, train_inputs, seed, use_cuda):
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = seed
+        with fluid.program_guard(main, startup):
+            data = fluid.layers.data(
+                name='image', shape=[3, 224, 224], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            out = Lenet(data, class_dim=102)
+            loss = fluid.layers.cross_entropy(input=out, label=label)
+            loss = fluid.layers.mean(loss)
+
+            opt = fluid.optimizer.Momentum(
+                learning_rate=0.1,
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+
+            opt.minimize(loss)
+
+            # TODO(zcd): I found that onece the memory optimizer is open,
+            # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
+            # conv2d_1.b_0@GRAD. Those variables should not be pruned.
+            # fluid.memory_optimize(main)
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup)
+
+            feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+            pe = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=main)
+
+            fetch_list = []
+            all_vars = main.global_block().vars
+            for k, v in all_vars.iteritems():
+                if 'tmp' not in k and k[0] is not '_' or v.persistable:
+                    fetch_list.append(k)
+
+            for data in train_inputs:
+                ret = pe.run(fetch_list, feed=feeder.feed(data))
+                for i in range(len(fetch_list)):
+                    assert not math.isnan(np.sum(ret[i])) and \
+                           not math.isinf(np.sum(ret[i]))
+
+    def test_fetch_op(self):
+        tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
+        tst_reader_iter = tst_reader()
+
+        iters = 3
+        train_inputs = []
+        for i in range(iters):
+            train_inputs.append(tst_reader_iter.next())
+
+        os.environ['CPU_NUM'] = str(4)
+        self.parallel_exe(train_inputs, seed=1, use_cuda=True)
+        self.parallel_exe(train_inputs, seed=1, use_cuda=False)
+
+
+class TestFeedParallel(unittest.TestCase):
+    def parallel_exe(self, use_cuda, seed):
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = seed
+        with fluid.scope_guard(fluid.core.Scope()):
+            with fluid.program_guard(main, startup):
+                data = fluid.layers.data(
+                    name='image', shape=[3, 224, 224], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+                out = Lenet(data, class_dim=102)
+                loss = fluid.layers.cross_entropy(input=out, label=label)
+                loss = fluid.layers.mean(loss)
+                opt = fluid.optimizer.Momentum(
+                    learning_rate=0.1,
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+
+                opt.minimize(loss)
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+        reader = feeder.decorate_reader(
+            paddle.batch(
+                flowers.train(), batch_size=16), multi_devices=True)
+
+        exe = fluid.Executor(place)
+        exe.run(startup)
+
+        pe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=loss.name, main_program=main)
+
+        for batch_id, data in enumerate(reader()):
+            loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0])
+            print batch_id, loss_np
+            if batch_id == 2:
+                break
+
+    def test_feed_op(self):
+        os.environ['CPU_NUM'] = str(4)
+        self.parallel_exe(use_cuda=True, seed=1)
+        self.parallel_exe(use_cuda=False, seed=1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
new file mode 100644
index 0000000000..a801d99aa1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -0,0 +1,187 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def simple_fc_net(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    hidden = img
+    for _ in xrange(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def fc_with_batchnorm(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+
+    hidden = img
+    for _ in xrange(1):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+        hidden = fluid.layers.batch_norm(input=hidden)
+
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def check_simple_fc_convergence(self,
+                                    balance_parameter_opt_between_cards,
+                                    use_cuda=True):
+        self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
+        self.check_network_convergence(
+            simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
+
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_simple_fc(self):
+        self.check_simple_fc_convergence(False, use_cuda=True)
+        self.check_simple_fc_convergence(False, use_cuda=False)
+
+    def test_simple_fc_with_new_strategy(self):
+        self.check_simple_fc_convergence(True, use_cuda=True)
+        self.check_simple_fc_convergence(True, use_cuda=False)
+
+    def check_simple_fc_parallel_accuracy(self,
+                                          balance_parameter_opt_between_cards,
+                                          use_cuda=True):
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=True,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+        for p_f in parallel_first_loss:
+            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
+        for p_l in parallel_last_loss:
+            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+
+    def test_simple_fc_parallel_accuracy(self):
+        self.check_simple_fc_parallel_accuracy(False, use_cuda=True)
+        self.check_simple_fc_parallel_accuracy(False, use_cuda=False)
+
+    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
+        self.check_simple_fc_parallel_accuracy(True, use_cuda=True)
+        self.check_simple_fc_parallel_accuracy(True, use_cuda=False)
+
+    def check_batchnorm_fc_convergence(
+            self, balance_parameter_opt_between_cards, use_cuda):
+        self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_batchnorm_fc(self):
+        self.check_batchnorm_fc_convergence(False, use_cuda=True)
+        self.check_batchnorm_fc_convergence(False, use_cuda=False)
+
+    def test_batchnorm_fc_with_new_strategy(self):
+        self.check_batchnorm_fc_convergence(True, use_cuda=True)
+        self.check_batchnorm_fc_convergence(True, use_cuda=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
new file mode 100644
index 0000000000..066299e6c6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+import unittest
+import os
+
+
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    # pool = fluid.layers.pool2d(
+    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
+    conv = input
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+
+    squeeze = fluid.layers.fc(input=pool,
+                              size=num_channels / reduction_ratio,
+                              act='relu')
+    excitation = fluid.layers.fc(input=squeeze,
+                                 size=num_channels,
+                                 act='sigmoid')
+    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+    return scale
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) / 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out:
+        if stride == 1:
+            filter_size = 1
+        else:
+            filter_size = 3
+        return conv_bn_layer(input, ch_out, filter_size, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    # The number of first 1x1 convolutional channels for each bottleneck build block
+    # was halved to reduce the compution cost.
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters * 2,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+def SE_ResNeXt50Small(batch_size=2, use_feed=False):
+    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
+
+    img = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+
+    conv = conv_bn_layer(
+        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = fluid.layers.pool2d(
+        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+    cardinality = 32
+    reduction_ratio = 16
+    depth = [3, 4, 6, 3]
+    num_filters = [128, 256, 512, 1024]
+
+    for block in range(len(depth)):
+        for i in range(depth[block]):
+            conv = bottleneck_block(
+                input=conv,
+                num_filters=num_filters[block],
+                stride=2 if i == 0 and block != 0 else 1,
+                cardinality=cardinality,
+                reduction_ratio=reduction_ratio)
+
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    # Classifier layer:
+    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestResnet(TestParallelExecutorBase):
+    def check_resnet_convergence(self,
+                                 balance_parameter_opt_between_cards,
+                                 use_cuda=True,
+                                 iter=20):
+        os.environ['CPU_NUM'] = str(4)
+
+        import functools
+        batch_size = 2
+        self.check_network_convergence(
+            functools.partial(
+                SE_ResNeXt50Small, batch_size=batch_size),
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_resnet(self):
+        self.check_resnet_convergence(False, use_cuda=True)
+        self.check_resnet_convergence(False, use_cuda=False, iter=5)
+
+    def test_resnet_with_new_strategy(self):
+        self.check_resnet_convergence(True, use_cuda=True)
+        self.check_resnet_convergence(True, use_cuda=False, iter=5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
new file mode 100644
index 0000000000..31ba8c1d60
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+import os
+
+
+def simple_fc_net():
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    for _ in xrange(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    def check_network_convergence(self, use_cuda, build_strategy=None):
+        os.environ['CPU_NUM'] = str(4)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net()
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = np.random.normal(size=(batch_size, 784)).astype('float32')
+            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                loss_name=loss.name,
+                main_program=main,
+                build_strategy=build_strategy)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                main_program=test_program,
+                share_vars_from=train_exe,
+                build_strategy=build_strategy)
+
+            for i in xrange(5):
+                test_loss, = test_exe.run([loss.name], feed=feed_dict)
+                test_loss = np.array(test_loss)
+
+                train_loss, = train_exe.run([loss.name], feed=feed_dict)
+                train_loss = np.array(train_loss)
+                self.assertTrue(
+                    np.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
+
+    def test_parallel_testing(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            use_cuda=True, build_strategy=build_strategy)
+        self.check_network_convergence(
+            use_cuda=False, build_strategy=build_strategy)
+
+    def test_parallel_testing_with_new_strategy(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            use_cuda=True, build_strategy=build_strategy)
+        self.check_network_convergence(
+            use_cuda=False, build_strategy=build_strategy)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
new file mode 100644
index 0000000000..b6215fddb1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import transformer_model
+import numpy as np
+from parallel_executor_test_base import TestParallelExecutorBase
+import unittest
+import paddle
+import paddle.dataset.wmt16 as wmt16
+import os
+
+WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
+    # alreay been added, but the <pad> token is not added. Transformer requires
+    # sequences in a mini-batch are padded to have the same length. A <pad> token is
+    # added into the original dictionary in paddle.dateset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. It should plus 1 to include position
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Then, convert the numpy
+    data to tensors and return a dict mapping names to tensors.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    assert not use_feed, "transfomer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+class TestTransformer(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        reader = paddle.batch(
+            wmt16.train(ModelHyperParams.src_vocab_size,
+                        ModelHyperParams.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+
+        with fluid.recordio_writer.create_recordio_writer(
+                WMT16_RECORDIO_FILE) as writer:
+            for batch in reader():
+                for tensor in prepare_batch_input(
+                        batch, ModelHyperParams.src_pad_idx,
+                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                    t = fluid.LoDTensor()
+                    t.set(tensor, fluid.CPUPlace())
+                    writer.append_tensor(t)
+                writer.complete_append_tensor()
+
+    @unittest.skip("transformer is buggy in multi gpu")
+    def test_main(self):
+        self.check_network_convergence(transformer, use_cuda=True)
+        self.check_network_convergence(transformer, use_cuda=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
new file mode 100644
index 0000000000..2105d32066
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def PolygonBoxRestore(input):
+    shape = input.shape
+    batch_size = shape[0]
+    geo_channels = shape[1]
+    h = shape[2]
+    w = shape[3]
+    h_indexes = np.array(range(h) * w).reshape(
+        [w, h]).transpose()[np.newaxis, :]  # [1, h, w]
+    w_indexes = np.array(range(w) * h).reshape(
+        [h, w])[np.newaxis, :]  # [1, h, w]
+    indexes = np.concatenate(
+        (w_indexes, h_indexes))[np.newaxis, :]  # [1, 2, h, w]
+    indexes = indexes.repeat(
+        [geo_channels / 2],
+        axis=0)[np.newaxis, :]  # [1, geo_channels/2, 2, h, w]
+    indexes = indexes.repeat(
+        [batch_size], axis=0)  # [batch_size, geo_channels/2, 2, h, w]
+    return indexes.reshape(
+        input.shape) - input  # [batch_size, geo_channels, h, w]
+
+
+class TestPolygonBoxRestoreOp(OpTest):
+    def config(self):
+        self.input_shape = (1, 8, 2, 2)
+
+    def setUp(self):
+        self.config()
+        self.op_type = "polygon_box_transform"
+        input = np.random.random(self.input_shape).astype("float32")
+        self.inputs = {'Input': input}
+        output = PolygonBoxRestore(input)
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase1(TestPolygonBoxRestoreOp):
+    def config(self):
+        self.input_shape = (2, 10, 3, 2)
+
+
+class TestCase2(TestPolygonBoxRestoreOp):
+    def config(self):
+        self.input_shape = (3, 12, 4, 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py
new file mode 100644
index 0000000000..cbf1a7e0c5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
+
+
+class TestPreprocessor(unittest.TestCase):
+    def setUp(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=32)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist_for_preprocessor_test.recordio', reader, feeder)
+
+    def test_main(self):
+        N = 10
+
+        img_expected_res = []
+        lbl_expected_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.io.open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_expected_res.append(img_v / 2)
+                lbl_expected_res.append(lbl_v + 1)
+
+        img_actual_res = []
+        lbl_actual_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.io.open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            preprocessor = fluid.layers.io.Preprocessor(reader=data_file)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_actual_res.append(img_v)
+                lbl_actual_res.append(lbl_v)
+
+        for idx in range(N):
+            np.allclose(img_expected_res[idx], img_actual_res[idx])
+            np.allclose(lbl_expected_res[idx], lbl_actual_res[idx])
diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
new file mode 100644
index 0000000000..1c708d0386
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestRandomCropOp(OpTest):
+    def setUp(self):
+        to_crop = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] *
+                           5).astype("float32")
+        self.possible_res = [
+            np.array([[1, 2, 3], [5, 6, 7]]), np.array([[2, 3, 4], [6, 7, 8]]),
+            np.array([[5, 6, 7], [9, 10, 11]]),
+            np.array([[6, 7, 8], [10, 11, 12]])
+        ]
+        self.op_type = "random_crop"
+        self.inputs = {'X': to_crop, 'Seed': np.array([10])}
+        self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])}
+        self.attrs = {'shape': [2, 3]}
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        out = np.array(outs[1])
+        for ins in out[:]:
+            is_equal = [(ins == res).all() for res in self.possible_res]
+            self.assertIn(True, is_equal)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 9b0cc3534d..865c2b7df0 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -34,8 +34,10 @@ class TestMeanOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': 1}
-        self.outputs = {'Out': self.inputs['X'].mean(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [1]}
+        self.outputs = {
+            'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -50,8 +52,10 @@ class TestMaxOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_max"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': -1}
-        self.outputs = {'Out': self.inputs['X'].max(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [-1]}
+        self.outputs = {
+            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -63,8 +67,10 @@ class TestMinOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_min"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': 2}
-        self.outputs = {'Out': self.inputs['X'].min(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [2]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -87,9 +93,10 @@ class TestKeepDimReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': -2, 'keep_dim': True}
+        self.attrs = {'dim': [-2], 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=self.attrs['dim'], keepdims=True)
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
         }
 
     def test_check_output(self):
@@ -126,5 +133,67 @@ class TestReduceAll(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+## reduction in multi dims
+class TestReduceMeanOpMultiAxises(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_mean"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
+        self.attrs = {'dim': [1, 2]}
+        self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceMaxOpMultiAxises(OpTest):
+    """Remove Max with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_max"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2, -1]}
+        self.outputs = {
+            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestReduceMinOpMultiAxises(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [1, 2]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestKeepDimReduceSumMultiAxises(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2, -1], 'keep_dim': True}
+        self.outputs = {
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py
new file mode 100644
index 0000000000..f845575a02
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestReverseOp(OpTest):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0]
+
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "reverse"
+        self.inputs = {"X": self.x}
+        self.attrs = {'axis': self.axis}
+        out = self.x
+        for a in self.axis:
+            out = np.flip(out, axis=a)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestCase0(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [1]
+
+
+class TestCase1(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0, 1]
+
+
+class TestCase2(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [0, 2]
+
+
+class TestCase3(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [1, 2]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py
new file mode 100644
index 0000000000..a62ee05007
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_shape_op.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestShapeOp(OpTest):
+    def setUp(self):
+        self.op_type = "shape"
+        self.config()
+        self.shape = [2, 3]
+        input = np.zeros(self.shape)
+        self.inputs = {'Input': input}
+        self.outputs = {'Out': np.array(self.shape)}
+
+    def config(self):
+        self.shape = [2, 3]
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class case1(TestShapeOp):
+    def config(self):
+        self.shape = [2]
+
+
+class case2(TestShapeOp):
+    def config(self):
+        self.shape = [1, 2, 3]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
new file mode 100644
index 0000000000..f4aa7426bc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
@@ -0,0 +1,80 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid.transpiler.distribute_transpiler import delete_ops
+
+from transpiler_test import TranspilerTest
+
+
+class TestSimpleDistTranspiler(TranspilerTest):
+    def setUp(self):
+        self.current_pserver_ep = "127.0.0.1:6175"
+
+    def test_simple_transpiler(self):
+        np.random.seed(1)
+
+        trainer = self.get_trainer()
+        pserver, startup = self.get_pserver(self.current_pserver_ep)
+        self.assertEqual([op.type for op in trainer.global_block().ops],
+                         self.get_expect_trainer_ops())
+
+        self.assertEqual(len(pserver.blocks), 2)
+        # block0: listen_and_serv
+        self.assertEqual([op.type for op in pserver.blocks[0].ops],
+                         ["listen_and_serv"])
+        # block1: optimize pass
+        self.assertEqual([op.type for op in pserver.blocks[1].ops],
+                         ["sum", "scale", "sgd"])
+
+        # confirm startup program
+        self.assertEqual([op.type for op in startup.global_block().ops],
+                         ["fill_constant", "uniform_random", "uniform_random"])
+
+        # the variable #fc_w will NOT be splited
+        fc_w_var = startup.global_block().var("fc_w@GRAD")
+        self.assertEqual(fc_w_var.shape, (1000, 1000))
+
+        fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0")
+        self.assertEqual(fc_w_var.shape, (1000, 1000))
+
+    def get_expect_trainer_ops(self):
+        trainer = fluid.Program()
+
+        with fluid.program_guard(trainer):
+            optimize_ops, params_grads = self.net_conf()
+
+        delete_ops(trainer.global_block(), optimize_ops)
+        ops = [op.type for op in trainer.global_block().ops] + [
+            "send", "send_barrier", "recv", "recv", "fetch_barrier"
+        ]
+        ops.insert(ops.index("elementwise_add_grad") + 1, "send")
+        return ops
+
+    def _transpiler_instance(self):
+        main = self.get_main_program()
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            self.trainer_id,
+            program=main,
+            pservers=self.pserver_eps,
+            trainers=self.trainers,
+            slice_var_up=False)
+        return t
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
new file mode 100644
index 0000000000..1a48bce3bb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSliceOp(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': self.input}
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends
+        }
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [3, 3, 4]
+        self.axes = [0, 1, 2]
+        self.out = self.input[1:3, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase1(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 2]
+        self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+
+class TestCase2(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
similarity index 84%
rename from python/paddle/fluid/tests/unittests/test_split_var.py
rename to python/paddle/fluid/tests/unittests/test_slice_var.py
index 0c5e8901b9..82305b23a1 100644
--- a/python/paddle/fluid/tests/unittests/test_split_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
@@ -14,14 +14,14 @@
 
 import math
 import unittest
-from paddle.fluid.transpiler.distribute_transpiler import split_dense_variable
+from paddle.fluid.transpiler.distribute_transpiler import slice_variable
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import random
 
 
-class TestSplitVar(unittest.TestCase):
-    def check_split_output(self, shapes, expected_sizes, min_size):
+class TestSliceVar(unittest.TestCase):
+    def check_slice_output(self, shapes, expected_sizes, min_size):
         var_list = []
         program = fluid.Program()
         for shape in shapes:
@@ -31,7 +31,7 @@ class TestSplitVar(unittest.TestCase):
                 # dtype=core.VarDesc.VarType.LOD_TENSOR,
                 shape=shape)
             var_list.append(var)
-        blocks = split_dense_variable(var_list, 10, min_size)
+        blocks = slice_variable(var_list, 10, min_size)
         all_sizes = []
         for s in expected_sizes:
             for s2 in s:
@@ -49,7 +49,7 @@ class TestSplitVar(unittest.TestCase):
             [1150, 1150, 1150, 1150, 1150, 1150, 1100]
         ]
 
-        self.check_split_output(shapes, expected_sizes, 1024)
+        self.check_slice_output(shapes, expected_sizes, 1024)
 
     def test_check_output_8k(self):
         shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10],
@@ -57,7 +57,7 @@ class TestSplitVar(unittest.TestCase):
         expected_sizes = [[15], [1024], [10976, 10976], [8160], [8000],
                           [35937, 35937, 35937, 35937, 35937, 35937]]
 
-        self.check_split_output(shapes, expected_sizes, 8192)
+        self.check_slice_output(shapes, expected_sizes, 8192)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
new file mode 100644
index 0000000000..1dc94a80c9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -0,0 +1,182 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def as_lodtensor(np_array, lod, place):
+    tensor = core.LoDTensor()
+    tensor.set(np_value, place)
+    if lod is not None:
+        tensor.set_lod(lod)
+    return tensor
+
+
+def create_op(scope, op_type, inputs, outputs, attrs):
+    kwargs = dict()
+
+    op_maker = core.op_proto_and_checker_maker
+    op_role_attr_name = op_maker.kOpRoleAttrName()
+
+    if op_role_attr_name not in attrs:
+        attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
+
+    def __create_var__(name, var_name):
+        scope.var(var_name).get_tensor()
+        kwargs[name].append(var_name)
+
+    for in_name, in_dup in Operator.get_op_inputs(op_type):
+        if in_name in inputs:
+            kwargs[in_name] = []
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, _ = item[0], item[1]
+                    __create_var__(in_name, sub_in_name)
+            else:
+                __create_var__(in_name, in_name)
+
+    for out_name, out_dup in Operator.get_op_outputs(op_type):
+        if out_name in outputs:
+            kwargs[out_name] = []
+            if out_dup:
+                sub_out = outputs[out_name]
+                for item in sub_out:
+                    sub_out_name, _ = item[0], item[1]
+                    __create_var__(out_name, sub_out_name)
+            else:
+                __create_var__(out_name, out_name)
+
+    for attr_name in Operator.get_op_attr_names(op_type):
+        if attr_name in attrs:
+            kwargs[attr_name] = attrs[attr_name]
+
+    return Operator(op_type, **kwargs)
+
+
+def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_lod(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
+
+    for in_name, in_dup in Operator.get_op_inputs(op.type()):
+        if in_name in inputs:
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, sub_in_val = item[0], item[1]
+                    __set_input__(sub_in_name, sub_in_val)
+            else:
+                __set_input__(in_name, inputs[in_name])
+
+
+def append_input_output(block, op_proto, np_list, is_input, dtype):
+    '''Insert VarDesc and generate Python variable instance'''
+    proto_list = op_proto.inputs if is_input else op_proto.outputs
+
+    def create_var(block, name, np_list, var_proto):
+        dtype = None
+        shape = None
+        lod_level = None
+        if name not in np_list:
+            assert var_proto.intermediate, "{} not found".format(name)
+        else:
+            np_value = np_list[name]
+            if isinstance(np_value, tuple):
+                dtype = np_value[0].dtype
+                # output shape, lod should be infered from input.
+                if is_input:
+                    shape = list(np_value[0].shape)
+                    lod_level = len(np_value[1])
+            else:
+                dtype = np_value.dtype
+                if is_input:
+                    shape = list(np_value.shape)
+                    lod_level = 0
+        return block.create_var(
+            dtype=dtype, shape=shape, lod_level=lod_level, name=name)
+
+    var_dict = {}
+    for var_proto in proto_list:
+        var_name = str(var_proto.name)
+        if is_input:
+            if (var_name not in np_list) and var_proto.dispensable:
+                continue
+            assert (var_name in np_list) or (var_proto.dispensable), \
+                "Missing {} as input".format(var_name)
+        if var_proto.duplicable:
+            assert isinstance(np_list[var_name], list), \
+                "Duplicable {} should be set as list".format(var_name)
+            var_list = []
+            for (name, np_value) in np_list[var_name]:
+                var_list.append(
+                    create_var(block, name, {name: np_value}, var_proto))
+            var_dict[var_name] = var_list
+        else:
+            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
+
+    return var_dict
+
+
+def append_loss_ops(block, output_names):
+    mean_inputs = map(block.var, output_names)
+    # for item in mean_inputs:
+    #     print(item)
+    #     print("Item", item.dtype)
+
+    if len(mean_inputs) == 1:
+        loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
+        op = block.append_op(
+            inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+    else:
+        avg_sum = []
+        for cur_loss in mean_inputs:
+            cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
+            op = block.append_op(
+                inputs={"X": [cur_loss]},
+                outputs={"Out": [cur_avg_loss]},
+                type="mean")
+            op.desc.infer_var_type(block.desc)
+            op.desc.infer_shape(block.desc)
+            avg_sum.append(cur_avg_loss)
+
+        loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
+        op_sum = block.append_op(
+            inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
+        op_sum.desc.infer_var_type(block.desc)
+        op_sum.desc.infer_shape(block.desc)
+
+        loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
+        op_loss = block.append_op(
+            inputs={"X": loss_sum},
+            outputs={"Out": loss},
+            type='scale',
+            attrs={'scale': 1.0 / float(len(avg_sum))})
+        op_loss.desc.infer_var_type(block.desc)
+        op_loss.desc.infer_shape(block.desc)
+    return loss
diff --git a/python/paddle/fluid/tests/unittests/transpiler_test.py b/python/paddle/fluid/tests/unittests/transpiler_test.py
new file mode 100644
index 0000000000..d84c5d9c41
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/transpiler_test.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.layers as layers
+
+
+class TranspilerTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        self.trainer_id = 0
+        self.trainers = 2
+        self.pservers = 2
+        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
+
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'))
+
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+
+        optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+        return optimize_ops, params_grads
+
+    def get_main_program(self):
+        main = fluid.Program()
+
+        with fluid.program_guard(main):
+            self.net_conf()
+
+        return main
+
+    def get_trainer(self):
+        return self._transpiler_instance().get_trainer_program()
+
+    def get_pserver(self, ep):
+        t = self._transpiler_instance()
+        pserver = t.get_pserver_program(ep)
+        startup = t.get_startup_program(ep, pserver)
+        return pserver, startup
+
+    def _transpiler_instance(self):
+        main = self.get_main_program()
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            self.trainer_id,
+            program=main,
+            pservers=self.pserver_eps,
+            trainers=self.trainers)
+        return t
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 7da123dd92..efc28d8993 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -27,11 +27,8 @@ import parallel_executor
 from transpiler import distribute_transpiler
 
 __all__ = [
-    'Trainer',
-    'BeginEpochEvent',
-    'EndEpochEvent',
-    'BeginStepEvent',
-    'EndStepEvent',
+    'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
+    'EndStepEvent', 'CheckpointConfig'
 ]
 
 
@@ -59,6 +56,35 @@ class EndStepEvent(object):
         self.metrics = metrics
 
 
+class CheckpointConfig(object):
+    def __init__(self,
+                 checkpoint_dir=None,
+                 max_num_checkpoints=3,
+                 epoch_interval=1,
+                 step_interval=10):
+        if checkpoint_dir is None:
+            self.checkpoint_dir = os.getcwd()
+        else:
+            self.checkpoint_dir = checkpoint_dir
+
+        self.max_num_checkpoints = max_num_checkpoints
+
+        if epoch_interval < 1:
+            self.epoch_interval = 1
+        else:
+            self.epoch_interval = epoch_interval
+
+        if step_interval < 1:
+            self.step_interval = 10
+        else:
+            self.step_interval = step_interval
+
+        self.epoch_id = 0
+        self.step_id = 0
+        self.load_serial = None
+        self.is_pserver = False
+
+
 def check_and_get_place(place):
     """
     Check the type of place or get the default place
@@ -90,23 +116,32 @@ class Trainer(object):
 
     Args:
         train_func(callable): A function which will return loss. The loss must be a scalar.
-        optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer
+        optimizer_func(callable): A function that returns an Optimizer object.
         place: The device place of this trainer.
     """
 
     def __init__(self,
                  train_func,
-                 optimizer,
+                 optimizer_func,
                  param_path=None,
                  place=None,
-                 parallel=False):
+                 parallel=False,
+                 checkpoint_config=None):
         self.__stop = False
         self.parallel = parallel
         # 1. we need to generate a framework.Program by calling
         # program_func. Reference: fluid.program_guard in
         # test_word2vec.py
-        if not isinstance(optimizer, opt_module.Optimizer):
-            raise TypeError("The optimizer should be an instance of Optimizer")
+
+        # config for checkpoint
+        # only chief worker will save variables
+        self.trainer_id = 0
+        self.checkpoint_cfg = checkpoint_config
+        if self.checkpoint_cfg:
+            assert isinstance(self.checkpoint_cfg, CheckpointConfig)
+            serial = io.get_latest_checkpoint_serial(
+                self.checkpoint_cfg.checkpoint_dir)
+            self.checkpoint_cfg.load_serial = serial if serial >= 0 else None
 
         self.scope = core.Scope()
 
@@ -117,12 +152,15 @@ class Trainer(object):
             program_func_outs = train_func()
             self.train_func_outputs = program_func_outs if isinstance(
                 program_func_outs, list) else [program_func_outs]
-            self.test_program = self.train_program.clone()
+            self.test_program = self.train_program.clone(for_test=True)
+
+            # The first element of program_func_outs is loss.
+            loss = self.train_func_outputs[0]
+
+            optimizer = optimizer_func()
             if not isinstance(optimizer, opt_module.Optimizer):
                 raise TypeError(
                     "The optimizer should be an instance of Optimizer")
-            # The fisrt element of program_func_outs is loss.
-            loss = self.train_func_outputs[0]
             optimize_ops, params_grads = optimizer.minimize(loss)
 
         self.place = check_and_get_place(place)
@@ -136,9 +174,25 @@ class Trainer(object):
             exe = executor.Executor(place)
             exe.run(self.startup_program)
 
-        if param_path:
+        if self.checkpoint_cfg and self.checkpoint_cfg.load_serial:
+            with self._prog_and_scope_guard():
+                exe = executor.Executor(place)
+                io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
+                                   self.checkpoint_cfg.load_serial,
+                                   self.startup_program)
+
+            if not self.checkpoint_cfg.is_pserver:
+                epoch_id, step_id = io.load_trainer_args(
+                    self.checkpoint_cfg.checkpoint_dir,
+                    self.checkpoint_cfg.load_serial, self.trainer_id,
+                    self._get_checkpoint_load_args())
+                self.checkpoint_cfg.epoch_id = int(epoch_id)
+                self.checkpoint_cfg.step_id = int(step_id)
+
+        if param_path and os.path.isdir(param_path):
             # load params from param_path into scope
-            io.load_persistables(exe, dirname=param_path)
+            io.load_persist_vars_without_grad(
+                exe, dirname=param_path, program=self.startup_program)
 
     def _transpile_nccl2_dist(self):
         # PADDLE_TRAINER_IPS
@@ -193,14 +247,18 @@ class Trainer(object):
         current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
         # the unique trainer id, starting from 0, needed by trainer
         # only
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+
         # the role, should be either PSERVER or TRAINER
         training_role = os.getenv("PADDLE_TRAINING_ROLE")
         with self._prog_and_scope_guard():
             t = distribute_transpiler.DistributeTranspiler()
             t.transpile(
-                trainer_id, pservers=pserver_endpoints, trainers=trainers)
+                self.trainer_id, pservers=pserver_endpoints, trainers=trainers)
             if training_role == "PSERVER":
+                if self.checkpoint_cfg:
+                    self.is_pserver = True
+
                 self.train_program = t.get_pserver_program(current_endpoint)
                 self.startup_program = t.get_startup_program(current_endpoint,
                                                              self.train_program)
@@ -293,11 +351,26 @@ class Trainer(object):
             self._train_by_any_executor(event_handler, exe, num_epochs, reader)
 
     def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
-        for epoch_id in range(num_epochs):
+        if self.checkpoint_cfg:
+            epochs = [
+                epoch_id for epoch_id in range(num_epochs)
+                if epoch_id >= self.checkpoint_cfg.epoch_id
+            ]
+        else:
+            epochs = [epoch_id for epoch_id in range(num_epochs)]
+
+        for epoch_id in epochs:
             event_handler(BeginEpochEvent(epoch_id))
             for step_id, data in enumerate(reader()):
                 if self.__stop:
+                    if self.checkpoint_cfg:
+                        self._clean_checkpoint()
                     return
+
+                if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \
+                    and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id:
+                    continue
+
                 begin_event = BeginStepEvent(epoch_id, step_id)
                 event_handler(begin_event)
                 if begin_event.fetch_metrics:
@@ -308,8 +381,13 @@ class Trainer(object):
                                       ])
                 else:
                     metrics = exe.run(feed=data, fetch_list=[])
+
+                if self.checkpoint_cfg:
+                    self._save_checkpoint(epoch_id, step_id)
                 event_handler(EndStepEvent(epoch_id, step_id, metrics))
             event_handler(EndEpochEvent(epoch_id))
+        if self.checkpoint_cfg:
+            self._clean_checkpoint()
 
     def _test_by_executor(self, reader, feed_order, fetch_list):
         with executor.scope_guard(self.scope):
@@ -348,6 +426,38 @@ class Trainer(object):
                 loss_name=self.train_func_outputs[0].name)
         return self._get_parallel_executor()
 
+    def _clean_checkpoint(self):
+        assert self.checkpoint_cfg
+        io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
+
+    def _get_checkpoint_load_args(self):
+        """
+        epoch_id and step_id are runtime arguments, they are not variables, will load them independently.
+        """
+        return ["epoch_id", "step_id"]
+
+    def _get_checkpoint_save_args(self, epoch_id, step_id):
+        """
+        epoch_id and step_id are runtime arguments, they are not variables, will save them independently.
+        """
+        trainer_args = {}
+        trainer_args["epoch_id"] = epoch_id
+        trainer_args["step_id"] = step_id
+        return trainer_args
+
+    def _save_checkpoint(self, epoch_id, step_id):
+        assert self.checkpoint_cfg
+
+        if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0:
+            exe = executor.Executor(self.place)
+            io.save_checkpoint(
+                executor=exe,
+                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                trainer_id=self.trainer_id,
+                trainer_args=self._get_checkpoint_save_args(epoch_id, step_id),
+                main_program=self.train_program,
+                max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
+
 
 def build_feed_var_list(program, feed_order):
     if not isinstance(program, framework.Program):
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
index 413c36c5c4..cf18090f71 100644
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -15,9 +15,9 @@
 from distribute_transpiler import DistributeTranspiler
 from inference_transpiler import InferenceTranspiler
 from memory_optimization_transpiler import memory_optimize, release_memory
-from distribute_transpiler_simple import SimpleDistributeTranspiler
+from ps_dispatcher import HashName, RoundRobin
 
 __all__ = [
-    "DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler",
-    "memory_optimize", "release_memory"
+    "DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
+    "release_memory", "HashName", "RoundRobin"
 ]
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
new file mode 100644
index 0000000000..dc597c3384
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from program_utils import *
+from ufind import *
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
new file mode 100644
index 0000000000..f10b496306
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def delete_ops(block, ops):
+    try:
+        start = list(block.ops).index(ops[0])
+        end = list(block.ops).index(ops[-1])
+        [block.remove_op(start) for _ in xrange(end - start + 1)]
+    except Exception, e:
+        raise e
+    block.program.sync_with_cpp()
+
+
+def find_op_by_input_arg(block, arg_name):
+    for index, op in enumerate(block.ops):
+        if arg_name in op.input_arg_names:
+            return index
+    return -1
+
+
+def find_op_by_output_arg(block, arg_name):
+    for index, op in enumerate(block.ops):
+        if arg_name in op.output_arg_names:
+            return index
+    return -1
diff --git a/python/paddle/fluid/transpiler/details/ufind.py b/python/paddle/fluid/transpiler/details/ufind.py
new file mode 100644
index 0000000000..0e30d0e3f9
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/ufind.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class UnionFind(object):
+    """ Union-find data structure.
+
+    Union-find is a data structure that keeps track of a set of elements partitioned
+    into a number of disjoint (non-overlapping) subsets.
+
+    Reference:
+    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+
+    Args:
+      elements(list): The initialize element list.
+    """
+
+    def __init__(self, elementes=None):
+        self._parents = []  # index -> parent index
+        self._index = {}  # element -> index
+        self._curr_idx = 0
+        if not elementes:
+            elementes = []
+        for ele in elementes:
+            self._parents.append(self._curr_idx)
+            self._index.update({ele: self._curr_idx})
+            self._curr_idx += 1
+
+    def find(self, x):
+        # Find the root index of given element x,
+        # execute the path compress while findind the root index
+        if not x in self._index:
+            return -1
+        idx = self._index[x]
+        while idx != self._parents[idx]:
+            t = self._parents[idx]
+            self._parents[idx] = self._parents[t]
+            idx = t
+        return idx
+
+    def union(self, x, y):
+        # Union two given element
+        x_root = self.find(x)
+        y_root = self.find(y)
+
+        if x_root == y_root:
+            return
+        self._parents[x_root] = y_root
+
+    def is_connected(self, x, y):
+        # If two given elements have the same root index,
+        # then they are connected.
+        return self.find(x) == self.find(y)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 42ff0a9eb1..2480d4e76a 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -11,20 +11,49 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Transpile the program to distributed data-parallelism programs.
+The main_program will be transformed to use a remote parameter server
+to do parameter optimization. And the optimization graph will be put
+into a parameter server program.
+
+Use different methods to split trainable variables to different
+parameter servers.
+
+Steps to transpile trainer:
+1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
+2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
+3. modify trainer program add split_op to each grad variable.
+4. append send_op to send splited variables to server and 
+5. add recv_op to fetch params(splited blocks or origin param) from server.
+6. append concat_op to merge splited blocks to update local weights.
+
+Steps to transpile pserver:
+1. create new program for parameter server.
+2. create params and grad variables that assigned to current server instance.
+3. create a sub-block in the server side program
+4. append ops that should run on current server instance.
+5. add listen_and_serv op
+"""
 
 from __future__ import print_function
 
 import math
+import numpy as np
 
-import distributed_splitter as splitter
+from ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
                         default_startup_program, \
                         Variable, Parameter, grad_var_name
+from details import *
 
 LOOKUP_TABLE_TYPE = "lookup_table"
 LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
-RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR"
+OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
+)
+RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
 
 
 class VarBlock:
@@ -38,62 +67,11 @@ class VarBlock:
         return "%s:%d:%d" % (self.varname, self.offset, self.size)
 
 
-class UnionFind(object):
-    """ Union-find data structure.
-
-    Union-find is a data structure that keeps track of a set of elements partitioned
-    into a number of disjoint (non-overlapping) subsets.
-
-    Reference:
-    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
-
-    Args:
-      elements(list): The initialize element list.
-    """
-
-    def __init__(self, elementes=None):
-        self._parents = []  # index -> parent index
-        self._index = {}  # element -> index
-        self._curr_idx = 0
-        if not elementes:
-            elementes = []
-        for ele in elementes:
-            self._parents.append(self._curr_idx)
-            self._index.update({ele: self._curr_idx})
-            self._curr_idx += 1
-
-    def find(self, x):
-        # Find the root index of given element x,
-        # execute the path compress while findind the root index
-        if not x in self._index:
-            return -1
-        idx = self._index[x]
-        while idx != self._parents[idx]:
-            t = self._parents[idx]
-            self._parents[idx] = self._parents[t]
-            idx = t
-        return idx
-
-    def union(self, x, y):
-        # Union two given element
-        x_root = self.find(x)
-        y_root = self.find(y)
-
-        if x_root == y_root:
-            return
-        self._parents[x_root] = y_root
-
-    def is_connected(self, x, y):
-        # If two given elements have the same root index,
-        # then they are connected.
-        return self.find(x) == self.find(y)
-
-
 def same_or_split_var(p_name, var_name):
     return p_name == var_name or p_name.startswith(var_name + ".block")
 
 
-def split_dense_variable(var_list, service_count, min_block_size=8192):
+def slice_variable(var_list, slice_count, min_block_size=8192):
     """
     We may need to split dense tensor to one or more blocks and put
     them equally onto parameter server. One block is a sub-tensor
@@ -101,25 +79,25 @@ def split_dense_variable(var_list, service_count, min_block_size=8192):
 
     We need to have a minimal block size so that the calculations in
     the parameter server side can gain better performance. By default
-    minimum block size 8K elements (maybe 16bit or 32bit or 64bit). 
+    minimum block size 8K elements (maybe 16bit or 32bit or 64bit).
 
     Args:
         var_list (list): List of variables.
-        service_count (int): Numel of pserver services. A pserver may have two
-            or more listening ports.
+        slice_count (int): Numel of count that variables will be sliced, which
+            could be the pserver services' count.
         min_block_size (int): Minimum splitted block size.
     Returns:
-        blocks (list[(varname, block_id, current_block_size)]): A list 
+        blocks (list[(varname, block_id, current_block_size)]): A list
             of VarBlocks. Each VarBlock specifies a shard of the var.
     """
     blocks = []
     for var in var_list:
-        split_count = service_count
+        split_count = slice_count
         var_numel = reduce(lambda x, y: x * y, var.shape)
         max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
         if max_pserver_count == 0:
             max_pserver_count = 1
-        if max_pserver_count < service_count:
+        if max_pserver_count < slice_count:
             split_count = max_pserver_count
         block_size = int(math.ceil(var_numel / float(split_count)))
 
@@ -139,84 +117,15 @@ def split_dense_variable(var_list, service_count, min_block_size=8192):
     return blocks
 
 
-def delete_ops(block, ops):
-    try:
-        start = list(block.ops).index(ops[0])
-        end = list(block.ops).index(ops[-1])
-        [block.remove_op(start) for _ in xrange(end - start + 1)]
-    except Exception, e:
-        raise e
-    block.program.sync_with_cpp()
-
-
 class DistributeTranspiler:
-    def transpile(self,
-                  trainer_id,
-                  program=None,
-                  pservers="127.0.0.1:6174",
-                  trainers=1,
-                  split_method=splitter.round_robin,
-                  sync_mode=True):
-        """
-        Transpile the program to distributed data-parallelism programs.
-        The main_program will be transformed to use a remote parameter server
-        to do parameter optimization. And the optimization graph will be put
-        into a parameter server program.
-
-        Use different methods to split trainable variables to different
-        parameter servers.
-
-        Steps to transpile trainer:
-        1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
-        2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
-        3. modify trainer program add split_op to each grad variable.
-        4. append send_op to send splited variables to server and fetch
-            params(splited blocks or origin param) from server.
-        5. append concat_op to merge splited blocks to update local weights.
-
-        Steps to transpile pserver:
-        1. create new program for parameter server.
-        2. create params and grad variables that assigned to current server instance.
-        3. create a sub-block in the server side program
-        4. append ops that should run on current server instance.
-        5. add listen_and_serv op
-
-        :param trainer_id: one unique id for each trainer in a job.
-        :type trainer_id: int
-        :param program: program to transpile, default is default_main_program
-        :type program: Program
-        :param pservers: parameter server endpoints like "m1:6174,m2:6174"
-        :type pservers: string
-        :param trainers: total number of workers/trainers in the job
-        :type trainers: int
-        :param split_method: A function to determin how to split variables
-            to different servers equally.
-        :type split_method: function
-        :type sync_mode: boolean default True
-        :param sync_mode: if sync_mode is set True, it means that dist transpiler
-        will transpile the program into sync_mode pserver and trainer program.
-        """
-        assert (callable(split_method))
-        if program is None:
-            program = default_main_program()
-        self.origin_program = program
-        self.trainer_num = trainers
-        self.sync_mode = sync_mode
-        # TODO(typhoonzero): currently trainer_id is fetched from cluster system
-        # like Kubernetes, we should port this to use etcd later when developing
-        # fluid distributed training with fault-tolerance.
-        self.trainer_id = trainer_id
-        pserver_endpoints = pservers.split(",")
-        self.pserver_endpoints = pserver_endpoints
-        self.optimize_ops, params_grads = self._get_optimize_pass()
-
+    def _has_distributed_lookup_table(self):
         # process lookup_table_op
         # 1. check all lookup_table_op is distributed
         # 2. check all lookup_table_op share the same table.
         distributed_lookup_table_ops = []
         # support only one distributed_lookup_table now
         self.table_name = None
-        for op in program.global_block().ops:
+        for op in self.origin_program.global_block().ops:
             if op.type == LOOKUP_TABLE_TYPE:
                 if op.attrs['is_distributed'] is True:
                     if self.table_name is None:
@@ -229,20 +138,13 @@ class DistributeTranspiler:
                     if self.table_name is not None:
                         assert op.input("W")[0] != self.table_name
 
-        self.has_distributed_lookup_table = len(
-            distributed_lookup_table_ops) > 0
-
-        # step1: For large parameters and gradients, split them into smaller
-        # blocks.
-        param_list = []
-        grad_list = []
-        for p, g in params_grads:
-            # skip parameter marked not trainable
-            if type(p) == Parameter and p.trainable == False:
-                continue
-            param_list.append(p)
-            grad_list.append(g)
+        return len(distributed_lookup_table_ops) > 0
 
+    def _update_dist_lookup_table_vars(self, param_list, grad_list,
+                                       params_grads):
+        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
+        # update self.table_param_grad and self.trainer_side_table_grad_list
+        program = self.origin_program
         if self.has_distributed_lookup_table:
             param_list = [
                 param for param in param_list if param.name != self.table_name
@@ -256,67 +158,223 @@ class DistributeTranspiler:
                 if param_grad[0].name == self.table_name
             ][0]
             table_grad_var = self.table_param_grad[1]
-            self.table_grad_list = [
-                program.global_block().create_var(
-                    name="%s.trainer_%d.pserver_%d" %
-                    (table_grad_var.name, trainer_id, index),
-                    type=table_grad_var.type,
-                    shape=table_grad_var.shape,
-                    dtype=table_grad_var.dtype)
-                for index in range(len(self.pserver_endpoints))
-            ]
+            if self.sync_mode:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.trainer_%d.pserver_%d" %
+                        (table_grad_var.name, self.trainer_id, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+            else:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.pserver_%d" % (table_grad_var.name, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+        return param_list, grad_list
+
+    def _init_splited_vars(self, slice_var_up):
+        # update these mappings for further transpile:
+        # 1. param_var_mapping: param var name -> [splited params vars]
+        # 2. grad_var_mapping: grad var name -> [splited grads vars]
+        # 3. grad_param_mapping: grad.blockx -> param.blockx
+        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
+
+        param_list = []
+        grad_list = []
+        param_grad_set = set()
+        for p, g in self.params_grads:
+            # skip parameter marked not trainable
+            if type(p) == Parameter and p.trainable == False:
+                continue
+            if p.name not in param_grad_set:
+                param_list.append(p)
+                param_grad_set.add(p.name)
+            if g.name not in param_grad_set:
+                grad_list.append(g)
+                param_grad_set.add(g.name)
+
+        param_list, grad_list = self._update_dist_lookup_table_vars(
+            param_list, grad_list, self.params_grads)
+
+        if slice_var_up:
+            # when we slice var up into blocks, we will slice the var according to
+            # pserver services' count. A pserver may have two or more listening ports.
+            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
+            param_blocks = slice_variable(param_list,
+                                          len(self.pserver_endpoints))
+        else:
+            # when we do NOT slice var up into blocks, we will always slice params
+            # grads into one block.
+            grad_blocks = slice_variable(grad_list, 1)
+            param_blocks = slice_variable(param_list, 1)
+        assert (len(grad_blocks) == len(param_blocks))
+
+        # origin_varname -> [splited_var]
+        self.param_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program, param_blocks)
+        self.grad_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program,
+            grad_blocks,
+            add_trainer_suffix=self.trainer_num > 1)
+        self.grad_param_mapping = dict()
+        for g, p in zip(grad_blocks, param_blocks):
+            g_name, g_bid, _ = g.split(":")
+            p_name, p_bid, _ = p.split(":")
+            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
+                    self.param_var_mapping[p_name][int(p_bid)]
 
-        grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
-        param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
-        # step2: Create new vars for the parameters and gradients blocks and
-        # add ops to do the split.
-        grad_var_mapping = self._append_split_op(program, grad_blocks)
-        param_var_mapping = self._create_vars_from_blocklist(program,
-                                                             param_blocks)
-
-        # step3: Add gradients as send op inputs and parameters as send
-        # op outputs.
-        send_inputs = []
-        send_outputs = []
-        for b in grad_blocks:  # append by order
-            varname, block_id, _ = b.split(":")
-            send_inputs.append(grad_var_mapping[varname][int(block_id)])
-
-        for b in param_blocks:
-            varname, block_id, _ = b.split(":")
-            send_outputs.append(param_var_mapping[varname][int(block_id)])
-
-        # let send_op know which endpoint to send which var to, eplist has the same
-        # order as send_inputs.
-        eplist = split_method(send_inputs, pserver_endpoints)
         # create mapping of endpoint -> split var to create pserver side program
         self.param_grad_ep_mapping = dict()
+        [
+            self.param_grad_ep_mapping.update({
+                ep: {
+                    "params": [],
+                    "grads": []
+                }
+            }) for ep in self.pserver_endpoints
+        ]
+
+    def transpile(self,
+                  trainer_id,
+                  program=None,
+                  pservers="127.0.0.1:6174",
+                  trainers=1,
+                  slice_var_up=True,
+                  split_method=RoundRobin,
+                  sync_mode=True):
+        """
+        :param trainer_id: one unique id for each trainer in a job.
+        :type trainer_id: int
+        :param program: program to transpile, default is default_main_program
+        :type program: Program
+        :param pservers: parameter server endpoints like "m1:6174,m2:6174"
+        :type pservers: string
+        :param trainers: total number of workers/trainers in the job
+        :type trainers: int
+        :param split_method: A function to determin how to split variables
+            to different servers equally.
+        :type split_method: function
+        :type sync_mode: boolean default True
+        :param sync_mode: if sync_mode is set True, it means that dist transpiler
+        will transpile the program into sync_mode pserver and trainer program.
+        """
+        assert (split_method.__bases__[0] == PSDispatcher)
+        if program is None:
+            program = default_main_program()
+        self.origin_program = program
+        self.trainer_num = trainers
+        self.sync_mode = sync_mode
+        self.trainer_id = trainer_id
+        pserver_endpoints = pservers.split(",")
+        self.pserver_endpoints = pserver_endpoints
+        self.optimize_ops, self.params_grads = self._get_optimize_pass()
+
+        ps_dispatcher = split_method(self.pserver_endpoints)
+        self.has_distributed_lookup_table = self._has_distributed_lookup_table()
+
+        # split and create vars, then put splited vars in dicts for later use.
+        self._init_splited_vars(slice_var_up)
+
+        # step 3.1: insert send op to send gradient vars to parameter servers
+        ps_dispatcher.reset()
+        send_vars = []
+
+        # in general cases, the number of pservers is times of 2, and this
+        # will lead to uneven distribution among weights and bias:
+        #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
+        #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
+        # shuffle the map will avoid the uneven distribution above
+        grad_var_mapping_items = self.grad_var_mapping.items()
+        if not slice_var_up:
+            np.random.shuffle(grad_var_mapping_items)
+
+        for orig_varname, splited_vars in grad_var_mapping_items:
+            eplist = ps_dispatcher.dispatch(splited_vars)
+
+            if not slice_var_up:
+                assert (len(splited_vars) == 1)
+
+            if len(splited_vars) == 1:
+                orig_varname = splited_vars[0].name
+                index = find_op_by_output_arg(program.global_block(),
+                                              orig_varname)
+            elif len(splited_vars) > 1:
+                orig_var = program.global_block().vars[orig_varname]
+                index = find_op_by_output_arg(program.global_block(),
+                                              orig_varname)
+                self._insert_split_op(program, orig_var, index, splited_vars)
+                index += 1
+            else:
+                AssertionError("Can not insert the send op by original "
+                               "variable name :", orig_varname)
+
+            program.global_block().insert_op(
+                index=index + 1,
+                type="send",
+                inputs={"X": splited_vars},
+                outputs={},
+                attrs={
+                    "epmap": eplist,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+            for _, var in enumerate(splited_vars):
+                send_vars.append(var)
+
+        if self.sync_mode:
+            program.global_block().append_op(
+                type="send_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": pserver_endpoints,
+                    "sync_mode": self.sync_mode,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
+        # step 3.2: insert recv op to receive parameters from parameter server
+        recv_vars = []
+        for _, var in enumerate(send_vars):
+            recv_vars.append(self.grad_param_mapping[var])
+        ps_dispatcher.reset()
+        eplist = ps_dispatcher.dispatch(recv_vars)
+
         for i, ep in enumerate(eplist):
-            param = send_outputs[i]
-            grad = send_inputs[i]
-            if not self.param_grad_ep_mapping.has_key(ep):
-                self.param_grad_ep_mapping[ep] = {"params": [], "grads": []}
-            self.param_grad_ep_mapping[ep]["params"].append(param)
-            self.param_grad_ep_mapping[ep]["grads"].append(grad)
-
-        rpc_client_var = program.global_block().create_var(
-            name=RPC_CLIENT_VAR_NAME,
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        # create send_op
+            self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
+            self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
+
+        # step4: Concat the parameters splits together after recv.
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            eps = []
+            for var in splited_var:
+                index = [v.name for v in recv_vars].index(var.name)
+                eps.append(eplist[index])
+
+            program.global_block().append_op(
+                type="recv",
+                inputs={},
+                outputs={"Out": splited_var},
+                attrs={
+                    "epmap": eps,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
         program.global_block().append_op(
-            type="send",
-            inputs={"X": send_inputs},
-            outputs={"Out": send_outputs,
-                     "RPCClient": rpc_client_var},
+            type="fetch_barrier",
+            inputs={},
+            outputs={},
             attrs={
                 "endpoints": pserver_endpoints,
-                "epmap": eplist,
-                "sync_mode": self.sync_mode
+                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
             })
-        # step4: Concat the parameters splits together after recv.
-        for varname, splited_var in param_var_mapping.iteritems():
+
+        for varname, splited_var in self.param_var_mapping.iteritems():
             if len(splited_var) <= 1:
                 continue
             orig_param = program.global_block().vars[varname]
@@ -327,10 +385,9 @@ class DistributeTranspiler:
                 attrs={"axis": 0})
 
         if self.has_distributed_lookup_table:
-            self._replace_lookup_table_op_with_prefetch(program, rpc_client_var,
-                                                        eplist)
-            self._split_table_grad_and_add_send_vars(program, rpc_client_var,
-                                                     pserver_endpoints)
+            self._replace_lookup_table_op_with_prefetch(program,
+                                                        pserver_endpoints)
+            self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
 
     def get_trainer_program(self):
         # remove optimize ops and add a send op to main_program
@@ -358,7 +415,6 @@ class DistributeTranspiler:
             # we don't need to create them when grad arrives.
             # change client side var name to origin name by
             # removing ".trainer_%d" suffix
-
             suff_idx = v.name.find(".trainer_")
             if suff_idx >= 0:
                 orig_var_name = v.name[:suff_idx]
@@ -395,24 +451,14 @@ class DistributeTranspiler:
         # located on current pserver
         opt_op_on_pserver = []
         for _, op in enumerate(self.optimize_ops):
-            if self._is_opt_op(op) and self._is_opt_op_on_pserver(endpoint, op):
+            if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
+                    endpoint, op):
                 opt_op_on_pserver.append(op)
         # step 3.3
         # Iterate through the ops, and if an op and the optimize ops
         # which located on current pserver are in one set, then
         # append it into the sub program.
 
-        # We try to put optimization program run parallelly, assume
-        # optimization program always looks like:
-        #
-        # prevop -> prevop -> opt op -> following op -> following op; ->
-        # prevop -> prevop -> opt op -> following op -> following op; ->
-        # global op -> global op
-        #
-        # we put operators that can run parallelly to many program blocks.
-        # in above example, we seperate ops by the ";". Global ops must run
-        # after all the optimize ops finished.
-
         global_ops = []
         # HACK: optimization global ops only used to scale beta1 and beta2
         # replace it with dependency engine.
@@ -420,12 +466,18 @@ class DistributeTranspiler:
             if self._is_adam_connected_op(op):
                 global_ops.append(op)
 
-        def __append_optimize_op__(op, block, grad_to_block_id):
-            if self._is_opt_op(op):
+        def __append_optimize_op__(op, block, grad_to_block_id, merged_var):
+            if self._is_optimizer_op(op):
                 self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
-                                         self.origin_program)
+                                         self.origin_program, merged_var)
             else:
-                self._append_pserver_non_opt_ops(block, op)
+                self._append_pserver_non_opt_ops(block, op, endpoint)
+
+        def __op_have_grad_input__(op):
+            for varname in op.input_arg_names:
+                if varname.find("@GRAD") >= 0:
+                    return varname
+            return ""
 
         # append lr decay ops to the child block if exists
         lr_ops = self._get_lr_ops()
@@ -433,17 +485,26 @@ class DistributeTranspiler:
             lr_decay_block = pserver_program.create_block(
                 pserver_program.num_blocks - 1)
             for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(lr_decay_block, op)
+                self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint)
 
         # append op to the current block
         grad_to_block_id = []
         pre_block_idx = pserver_program.num_blocks - 1
         for idx, opt_op in enumerate(opt_op_on_pserver):
             per_opt_block = pserver_program.create_block(pre_block_idx)
+            # append grad merging ops before clip and weight decay
+            for _, op in enumerate(self.optimize_ops):
+                # find the origin @GRAD var before clipping
+                grad_varname_for_block = __op_have_grad_input__(op)
+                if ufind.is_connected(op, opt_op) and grad_varname_for_block:
+                    merged_var = self._append_pserver_grad_merge_ops(
+                        per_opt_block, grad_varname_for_block, endpoint,
+                        grad_to_block_id, self.origin_program)
             for _, op in enumerate(self.optimize_ops):
                 # optimizer is connected to itself
                 if ufind.is_connected(op, opt_op) and op not in global_ops:
-                    __append_optimize_op__(op, per_opt_block, grad_to_block_id)
+                    __append_optimize_op__(op, per_opt_block, grad_to_block_id,
+                                           merged_var)
 
         # append global ops
         if global_ops:
@@ -451,46 +512,41 @@ class DistributeTranspiler:
                 pserver_program.num_blocks - 1)
             for glb_op in global_ops:
                 __append_optimize_op__(glb_op, opt_state_block,
-                                       grad_to_block_id)
-
-        # NOT USED: single block version:
-        #
-        # for _, op in enumerate(self.optimize_ops):
-        #     for _, opt_op in enumerate(opt_op_on_pserver):
-        #         if ufind.is_connected(op, opt_op):
-        #             __append_optimize_op__(glb_op, optimize_block)
-        #             break
+                                       grad_to_block_id, None)
 
         # process distributed lookup_table
-        prefetch_block = None
+        prefetch_var_name_to_block_id = []
         if self.has_distributed_lookup_table:
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
-                pserver_index, pserver_program, pre_block_idx)
-            prefetch_block = self._create_prefetch_block(
+                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
+            prefetch_var_name_to_block_id = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
 
         # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
         # not be executed, so it's safe to use optimize_block to hold the place
         if self.has_distributed_lookup_table:
-            assert prefetch_block is not None
+            assert len(prefetch_var_name_to_block_id) > 0
         else:
-            assert prefetch_block is None
-            prefetch_block = pserver_program.global_block()
+            assert len(prefetch_var_name_to_block_id) == 0
+
+        attrs = {
+            "OptimizeBlock": pserver_program.block(1),
+            "endpoint": endpoint,
+            "Fanin": self.trainer_num,
+            "sync_mode": self.sync_mode,
+            "grad_to_block_id": grad_to_block_id
+        }
+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs['prefetch_var_name_to_block_id'] \
+                = prefetch_var_name_to_block_id
 
         # step5 append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
             inputs={'X': recv_inputs},
             outputs={},
-            attrs={
-                "OptimizeBlock": pserver_program.block(1),
-                "endpoint": endpoint,
-                "Fanin": self.trainer_num,
-                "PrefetchBlock": prefetch_block,
-                "sync_mode": self.sync_mode,
-                "grad_to_block_id": grad_to_block_id
-            })
+            attrs=attrs)
 
         pserver_program.sync_with_cpp()
         return pserver_program
@@ -549,12 +605,21 @@ class DistributeTranspiler:
                     attrs=op.attrs)
         return s_prog
 
+    # ====================== private transpiler functions =====================
+
     # transpiler function for dis lookup_table
-    def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var,
-                                               eplist):
+    def _replace_lookup_table_op_with_prefetch(self, program,
+                                               pserver_endpoints):
         # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        self.prefetch_input_vars = None
-        self.prefetch_output_vars = None
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_input_vars = []
+
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_output_vars = []
 
         continue_search_lookup_table_op = True
         while continue_search_lookup_table_op:
@@ -568,18 +633,19 @@ class DistributeTranspiler:
                     ids_name = op.input("Ids")
                     out_name = op.output("Out")
 
-                    if self.prefetch_input_vars is None:
-                        ids_var = program.global_block().vars[ids_name[0]]
-                        self.prefetch_input_vars = self.create_splited_vars(
-                            source_var=ids_var,
-                            block=program.global_block(),
-                            tag="_prefetch_in_")
-                    if self.prefetch_output_vars is None:
-                        out_var = program.global_block().vars[out_name[0]]
-                        self.prefetch_output_vars = self.create_splited_vars(
-                            source_var=out_var,
-                            block=program.global_block(),
-                            tag="_prefetch_out_")
+                    ids_var = program.global_block().vars[ids_name[0]]
+                    prefetch_input_vars = self.create_splited_vars(
+                        source_var=ids_var,
+                        block=program.global_block(),
+                        tag="_prefetch_in_")
+                    self.all_prefetch_input_vars.append(prefetch_input_vars)
+
+                    out_var = program.global_block().vars[out_name[0]]
+                    prefetch_output_vars = self.create_splited_vars(
+                        source_var=out_var,
+                        block=program.global_block(),
+                        tag="_prefetch_out_")
+                    self.all_prefetch_output_vars.append(prefetch_output_vars)
 
                     # insert split_ids_op
                     program.global_block().insert_op(
@@ -591,24 +657,24 @@ class DistributeTranspiler:
                                 for varname in ids_name
                             ]
                         },
-                        outputs={"Out": self.prefetch_input_vars})
+                        outputs={"Out": prefetch_input_vars})
 
                     # insert prefetch_op
                     program.global_block().insert_op(
                         index=op_index + 1,
                         type="prefetch",
-                        inputs={'X': self.prefetch_input_vars},
-                        outputs={
-                            "Out": self.prefetch_output_vars,
-                            "RPCClient": rpc_client_var
-                        },
-                        attrs={"epmap": eplist})
+                        inputs={'X': prefetch_input_vars},
+                        outputs={"Out": prefetch_output_vars},
+                        attrs={
+                            "epmap": pserver_endpoints,
+                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                        })
 
                     # insert concat_op
                     program.global_block().insert_op(
                         index=op_index + 2,
                         type="concat",
-                        inputs={'X': self.prefetch_output_vars},
+                        inputs={'X': prefetch_output_vars},
                         outputs={
                             "Out": [
                                 program.global_block().vars[varname]
@@ -622,9 +688,8 @@ class DistributeTranspiler:
                     # break for loop
                     break
 
-    def _split_table_grad_and_add_send_vars(self, program, rpc_client_var,
-                                            pserver_endpoints):
-        # 2. add split_ids_op and send_vars_op to send gradient to pservers
+    def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
+        # 2. add split_ids_op and send_op to send gradient to pservers
         # there should only be one table_name
         all_ops = program.global_block().ops
         table_grad_name = grad_var_name(self.table_name)
@@ -638,56 +703,54 @@ class DistributeTranspiler:
                     inputs={
                         'Ids': [program.global_block().vars[table_grad_name]]
                     },
-                    outputs={"Out": self.table_grad_list})
+                    outputs={"Out": self.trainer_side_table_grad_list})
                 program.global_block().insert_op(
                     index=op_index + 2,
-                    type="send_vars",
-                    inputs={'X': self.table_grad_list},
-                    outputs={"RPCClient": rpc_client_var},
-                    attrs={"sync_send": True,
-                           "epmap": pserver_endpoints})
+                    type="send",
+                    inputs={'X': self.trainer_side_table_grad_list},
+                    outputs={},
+                    attrs={
+                        "sync_mode": True,
+                        "epmap": pserver_endpoints,
+                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    })
                 break
 
     def _create_prefetch_block(self, pserver_index, pserver_program,
                                optimize_block):
         # STEP: create prefetch block
         table_var = pserver_program.global_block().vars[self.table_name]
-        prefetch_block = pserver_program.create_block(optimize_block.idx)
-        trainer_ids = self.prefetch_input_vars[pserver_index]
-        pserver_ids = pserver_program.global_block().create_var(
-            name=trainer_ids.name,
-            type=trainer_ids.type,
-            shape=trainer_ids.shape,
-            dtype=trainer_ids.dtype)
-        trainer_out = self.prefetch_output_vars[pserver_index]
-        pserver_out = pserver_program.global_block().create_var(
-            name=trainer_out.name,
-            type=trainer_out.type,
-            shape=trainer_out.shape,
-            dtype=trainer_out.dtype)
-        prefetch_block.append_op(
-            type="lookup_sparse_table",
-            inputs={'Ids': pserver_ids,
-                    "W": table_var},
-            outputs={"Out": pserver_out},
-            attrs={
-                "is_sparse": True,  # has no effect on lookup_table op
-                "is_distributed": True,
-                "padding_idx": -1
-            })
-        return prefetch_block
+        prefetch_var_name_to_block_id = []
+        for index in range(len(self.all_prefetch_input_vars)):
+            prefetch_block = pserver_program.create_block(optimize_block.idx)
+            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
+            pserver_ids = pserver_program.global_block().create_var(
+                name=trainer_ids.name,
+                type=trainer_ids.type,
+                shape=trainer_ids.shape,
+                dtype=trainer_ids.dtype)
+            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
+            pserver_out = pserver_program.global_block().create_var(
+                name=trainer_out.name,
+                type=trainer_out.type,
+                shape=trainer_out.shape,
+                dtype=trainer_out.dtype)
+            prefetch_block.append_op(
+                type="lookup_sparse_table",
+                inputs={'Ids': pserver_ids,
+                        "W": table_var},
+                outputs={"Out": pserver_out},
+                attrs={
+                    "is_sparse": True,  # has no effect on lookup_table op
+                    "is_distributed": True,
+                    "padding_idx": -1
+                })
+            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+                prefetch_block.idx))
+        return prefetch_var_name_to_block_id
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
-                                     pre_block_idx):
-        def _clone_var(block, var, persistable=True):
-            assert isinstance(var, Variable)
-            return block.create_var(
-                name=var.name,
-                shape=var.shape,
-                dtype=var.dtype,
-                type=var.type,
-                persistable=persistable)
-
+                                     pre_block_idx, grad_to_block_id):
         # STEP: create table optimize block
         # create table param and grad var in pserver program
         origin_param_var = self.origin_program.global_block().vars[
@@ -698,11 +761,11 @@ class DistributeTranspiler:
             dtype=origin_param_var.dtype,
             type=core.VarDesc.VarType.SELECTED_ROWS,
             persistable=True)
-        grad_var = _clone_var(
-            pserver_program.global_block(),
+        # parameter must be selected rows
+        param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
+        grad_var = pserver_program.global_block().clone_variable(
             self.origin_program.global_block().vars[grad_var_name(
-                self.table_name)],
-            persistable=False)
+                self.table_name)])
 
         # create table optimize block in pserver program
         table_opt_op = [
@@ -716,7 +779,7 @@ class DistributeTranspiler:
         if self.sync_mode:
             # create grad vars in pserver program
             table_grad_var = self.table_param_grad[1]
-            table_grad_list = [
+            pserver_side_table_grad_list = [
                 pserver_program.global_block().create_var(
                     name="%s.trainer_%d.pserver_%d" %
                     (table_grad_var.name, index, pserver_index),
@@ -726,11 +789,21 @@ class DistributeTranspiler:
                 for index in range(self.trainer_num)
             ]
 
-            # append sum op for table_grad_list
+            # append sum op for pserver_side_table_grad_list
             table_opt_block.append_op(
                 type="sum",
-                inputs={"X": table_grad_list},
+                inputs={"X": pserver_side_table_grad_list},
                 outputs={"Out": [grad_var]})
+        else:
+            # in async_mode, for table gradient, it also need to be splited to each parameter server
+            origin_grad_name = grad_var.name
+            splited_grad_name = self.trainer_side_table_grad_list[
+                pserver_index].name
+            if not splited_grad_name.startswith(origin_grad_name):
+                raise ValueError("origin_grad_var: " + splited_grad_name +
+                                 " grad_var:" + grad_var.name)
+            grad_var = pserver_program.global_block().rename_var(
+                origin_grad_name, splited_grad_name)
 
         lr_var = pserver_program.global_block().vars[table_opt_op.input(
             "LearningRate")[0]]
@@ -746,9 +819,11 @@ class DistributeTranspiler:
             outputs=outputs,
             attrs=table_opt_op.attrs)
 
+        # add table parameter gradient and it's block id to grad_to_block_id
+        grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))
+
         return table_opt_block
 
-    # ====================== private transpiler functions =====================
     def _create_vars_from_blocklist(self,
                                     program,
                                     block_list,
@@ -761,8 +836,8 @@ class DistributeTranspiler:
             program (ProgramDesc): ProgramDesc which gradients blong.
             block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
             add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
-        Returns: 
-            var_mapping (dict(varname->[new_varname_variable])):A dict mapping 
+        Returns:
+            var_mapping (dict(varname->[new_varname_variable])):A dict mapping
                 from original var name to each var split.
         """
 
@@ -775,6 +850,9 @@ class DistributeTranspiler:
             if not block_map.has_key(varname):
                 block_map[varname] = []
             block_map[varname].append((long(offset), long(size)))
+        # Do not remove this important debug message:
+        print("block map: %s" % block_map)
+
         for varname, splited in block_map.iteritems():
             orig_var = program.global_block().var(varname)
             if len(splited) == 1:
@@ -838,50 +916,31 @@ class DistributeTranspiler:
             lod_level=var.lod_level,
             persistable=persistable)
 
-    def _append_split_op(self, program, gradblocks):
-        """
-        Split variables that need to be split and append respective ops
-        Args:
-            program (ProgramDesc): ProgramDesc that gradients blong.
-            gradblocks (list[(varname, block_id, block_size)]): List of gradient blocks.
-        Returns:
-            var_mapping (dict(varname->[new_splitted_variable])):A dict mapping 
-                from original var name to each var split.
-        """
-
-        add_suffix = False
-        if self.trainer_num > 1:
-            add_suffix = True
-        var_mapping = self._create_vars_from_blocklist(
-            program, gradblocks, add_trainer_suffix=add_suffix)
-        for varname, splited_vars in var_mapping.iteritems():
-            # variable that don't need to split have empty splited_vars
-            if len(splited_vars) <= 1:
-                continue
-            orig_var = program.global_block().vars[varname]
-            if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-                height_sections = []
-                for v in splited_vars:
-                    height_sections.append(v.shape[0])
-                program.global_block().append_op(
-                    type="split_selected_rows",
-                    inputs={"X": orig_var},
-                    outputs={"Out": splited_vars},
-                    attrs={"height_sections": height_sections})
-            elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
-                sections = []
-                for v in splited_vars:
-                    sections.append(v.shape[0])
-                program.global_block().append_op(
-                    type="split_byref",
-                    inputs={"X": orig_var},
-                    outputs={"Out": splited_vars},
-                    attrs={"sections": sections}  # assume split evenly
-                )
-            else:
-                AssertionError("Variable type should be in set "
-                               "[LOD_TENSOR, SELECTED_ROWS]")
-        return var_mapping
+    def _insert_split_op(self, program, orig_var, index, splited_vars):
+        if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+            height_sections = []
+            for v in splited_vars:
+                height_sections.append(v.shape[0])
+            program.global_block().insert_op(
+                index=index + 1,
+                type="split_selected_rows",
+                inputs={"X": orig_var},
+                outputs={"Out": splited_vars},
+                attrs={"height_sections": height_sections})
+        elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
+            sections = []
+            for v in splited_vars:
+                sections.append(v.shape[0])
+            program.global_block().insert_op(
+                index=index + 1,
+                type="split_byref",
+                inputs={"X": orig_var},
+                outputs={"Out": splited_vars},
+                attrs={"sections": sections}  # assume split evenly
+            )
+        else:
+            AssertionError("Variable type should be in set "
+                           "[LOD_TENSOR, SELECTED_ROWS]")
 
     def _get_optimizer_input_shape(self, op_type, varkey, orig_shape,
                                    param_shape):
@@ -910,17 +969,74 @@ class DistributeTranspiler:
             pass
         return orig_shape
 
-    def _orig_varname(self, varname):
-        suff_idx = varname.find(".trainer_")
+    def _get_varname_parts(self, varname):
+        # returns origin, blockid, trainerid
         orig_var_name = ""
-        if suff_idx >= 0:
-            orig_var_name = varname[:suff_idx]
+        trainer_part = ""
+        block_part = ""
+        trainer_idx = varname.find(".trainer_")
+        if trainer_idx >= 0:
+            trainer_part = varname[trainer_idx + 1:]
+        else:
+            trainer_idx = len(varname)
+        block_index = varname.find(".block")
+        if block_index >= 0:
+            block_part = varname[block_index + 1:trainer_idx]
         else:
-            orig_var_name = varname
-        return orig_var_name
+            block_index = len(varname)
+        orig_var_name = varname[0:min(block_index, trainer_idx)]
+        return orig_var_name, block_part, trainer_part
+
+    def _orig_varname(self, varname):
+        orig, _, _ = self._get_varname_parts(varname)
+        return orig
+
+    def _append_pserver_grad_merge_ops(self, optimize_block,
+                                       grad_varname_for_block, endpoint,
+                                       grad_to_block_id, origin_program):
+        program = optimize_block.program
+        pserver_block = program.global_block()
+        grad_block = None
+        for g in self.param_grad_ep_mapping[endpoint]["grads"]:
+            if self._orig_varname(g.name) == \
+                    self._orig_varname(grad_varname_for_block):
+                grad_block = g
+                break
+        if not grad_block:
+            # do not append this op if current endpoint
+            # is not dealing with this grad block
+            return
+        orig_varname, block_name, trainer_name = self._get_varname_parts(
+            grad_block.name)
+        if block_name:
+            merged_var_name = '.'.join([orig_varname, block_name])
+        else:
+            merged_var_name = orig_varname
+        merged_var = \
+            pserver_block.vars[merged_var_name]
+        grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
+        if self.sync_mode and self.trainer_num > 1:
+            vars2merge = []
+            for i in xrange(self.trainer_num):
+                per_trainer_name = "%s.trainer_%d" % \
+                (merged_var_name, i)
+                vars2merge.append(pserver_block.vars[per_trainer_name])
+
+            optimize_block.append_op(
+                type="sum",
+                inputs={"X": vars2merge},
+                outputs={"Out": merged_var})
+            # TODO(panyx0718): What if it's SELECTED_ROWS.
+            if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+                optimize_block.append_op(
+                    type="scale",
+                    inputs={"X": merged_var},
+                    outputs={"Out": merged_var},
+                    attrs={"scale": 1.0 / float(self.trainer_num)})
+        return merged_var
 
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
-                            grad_to_block_id, origin_program):
+                            grad_to_block_id, origin_program, merged_var):
         program = optimize_block.program
         pserver_block = program.global_block()
         new_inputs = dict()
@@ -928,40 +1044,6 @@ class DistributeTranspiler:
         # moment can use the updated shape
         for key in opt_op.input_names:
             if key == "Grad":
-                grad_block = None
-                for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-                    if same_or_split_var(
-                            self._orig_varname(g.name),
-                            self._orig_varname(opt_op.input(key)[0])):
-                        grad_block = g
-                        break
-                if not grad_block:
-                    # do not append this op if current endpoint
-                    # is not dealing with this grad block
-                    return
-                merged_var = \
-                    pserver_block.vars[self._orig_varname(grad_block.name)]
-                grad_to_block_id.append(merged_var.name + ":" + str(
-                    optimize_block.idx))
-                if self.sync_mode and self.trainer_num > 1:
-                    vars2merge = []
-                    for i in xrange(self.trainer_num):
-                        per_trainer_name = "%s.trainer_%d" % \
-                        (self._orig_varname(grad_block.name), i)
-                        vars2merge.append(pserver_block.vars[per_trainer_name])
-
-                    optimize_block.append_op(
-                        type="sum",
-                        inputs={"X": vars2merge},
-                        outputs={"Out": merged_var})
-                    # TODO(panyx0718): What if it's SELECTED_ROWS.
-                    if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-                        optimize_block.append_op(
-                            type="scale",
-                            inputs={"X": merged_var},
-                            outputs={"Out": merged_var},
-                            attrs={"scale": 1.0 / float(self.trainer_num)})
-
                 new_inputs[key] = merged_var
             elif key == "Param":
                 # param is already created on global program
@@ -1020,17 +1102,31 @@ class DistributeTranspiler:
             outputs=outputs,
             attrs=opt_op.attrs)
 
-    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
+    def _is_splited_grad_var(self, var, var_dict):
+        grad_block = None
+        for _, g in var_dict.iteritems():
+            if self._orig_varname(g.name) == self._orig_varname(var.name):
+                if g.name.find(".trainer_") == -1:
+                    grad_block = g
+                    break
+        return grad_block
+
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint):
         program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
         inputs = self._get_input_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-        for varlist in inputs.itervalues():
+        for key, varlist in inputs.iteritems():
             if not isinstance(varlist, list):
                 varlist = [varlist]
-
             for var in varlist:
-                if not program.global_block().vars.has_key(var.name):
+                # for ops like clipping and weight decay, get the splited var
+                # for inputs/outputs
+                grad_block = self._is_splited_grad_var(
+                    var, program.global_block().vars)
+                if grad_block:
+                    inputs[key] = grad_block
+                elif not program.global_block().vars.has_key(var.name):
                     program.global_block().create_var(
                         name=var.name,
                         persistable=var.persistable,
@@ -1039,13 +1135,16 @@ class DistributeTranspiler:
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-
-        for varlist in outputs.itervalues():
+        for key, varlist in outputs.iteritems():
             if not isinstance(varlist, list):
                 varlist = [varlist]
-
             for var in varlist:
-                program.global_block().clone_variable(var)
+                grad_block = self._is_splited_grad_var(
+                    var, program.global_block().vars)
+                if grad_block:
+                    outputs[key] = grad_block
+                elif not program.global_block().vars.has_key(var.name):
+                    program.global_block().clone_variable(var)
 
         optimize_block.append_op(
             type=opt_op.type,
@@ -1091,9 +1190,17 @@ class DistributeTranspiler:
                     ufind.union(op1, op2)
         return ufind
 
-    def _is_opt_op(self, op):
-        # NOTE: It's a HACK implement.
-        # optimize op: SGDOptimize, MomentumOptimizer, AdamOptimizer and etc...
+    def _is_opt_role_op(self, op):
+        # NOTE: depend on oprole to find out whether this op is for
+        # optimize
+        op_maker = core.op_proto_and_checker_maker
+        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+        if op_maker.kOpRoleAttrName() in op.attrs and \
+            int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+            return True
+        return False
+
+    def _is_optimizer_op(self, op):
         if "Param" in op.input_names and \
             "LearningRate" in op.input_names:
             return True
@@ -1143,7 +1250,7 @@ class DistributeTranspiler:
         # find learning rate variables by optimize op
         lr_vars = set()
         for op in self.optimize_ops:
-            if self._is_opt_op(op):
+            if self._is_optimizer_op(op):
                 lr_vars.add(op.input("LearningRate")[0])
 
         find_ops = []
@@ -1160,7 +1267,7 @@ class DistributeTranspiler:
                 # NOTE: we need to skip all optimize ops, since it is connected
                 # with forward/backward ops and lr ops, we only need the lr ops.
                 if op1 != op2 and self._is_op_connected(op1, op2) and \
-                    not self._is_opt_op(op1) and not self._is_opt_op(op2):
+                    not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
                     ufind.union(op1, op2)
         # find all ops which is related with lr var
         for op1 in block.ops:
@@ -1181,13 +1288,21 @@ class DistributeTranspiler:
         block = self.origin_program.global_block()
         opt_ops = []
         params_grads = []
+        origin_var_dict = self.origin_program.global_block().vars
         for op in block.ops:
-            if self._is_opt_op(op):
+            if self._is_opt_role_op(op):
                 opt_ops.append(op)
-                params_grads.append((self.origin_program.global_block().var(
-                    op.input("Param")[0]),
-                                     self.origin_program.global_block().var(
-                                         op.input("Grad")[0])))
+                # HACK(wuyi): if we find grad vars from input of optimize
+                # ops, we may get the output of clip op. Use syntax "@GRAD"
+                # and op_role_var to get the pair.
+                for input_name in op.input_arg_names:
+                    if input_name.find("@GRAD") != -1 and \
+                        op.attrs[RPC_OP_ROLE_ATTR_NAME]:
+                        param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0]
+                        params_grads.append([
+                            origin_var_dict[param_name],
+                            origin_var_dict[input_name]
+                        ])
             elif self._is_adam_connected_op(op):
                 opt_ops.append(op)
             else:
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py
deleted file mode 100644
index ea8c27cdca..0000000000
--- a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py
+++ /dev/null
@@ -1,254 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..framework import Program, default_main_program, Parameter, Variable
-from ..layer_helper import LayerHelper
-
-
-def hash_name_to_server(params_grads, pserver_endpoints):
-    """
-    :param param_grads:
-    :return: a map of pserver endpoint -> 
-                    params -> [param list]
-                    grads  -> [grad list]
-    """
-
-    def _hash_param(param_name, total):
-        return hash(param_name) % total
-
-    param_grad_map = dict()
-    for param, grad in params_grads:
-        if param.trainable is True and grad is not None:
-            server_id = _hash_param(param.name, len(pserver_endpoints))
-            server_for_param = pserver_endpoints[server_id]
-            if not param_grad_map.has_key(server_for_param):
-                param_grad_map[server_for_param] = {"params": [], "grads": []}
-            param_grad_map[server_for_param]["params"].append(param)
-            param_grad_map[server_for_param]["grads"].append(grad)
-
-    return param_grad_map
-
-
-def round_robin(params_grads, pserver_endpoints):
-    assert (len(params_grads) > len(pserver_endpoints))
-
-    param_grad_map = dict()
-    pserver_idx = 0
-    for param, grad in params_grads:
-        if param.trainable is True:
-            server_for_param = pserver_endpoints[pserver_idx]
-            if not param_grad_map.has_key(server_for_param):
-                param_grad_map[server_for_param] = {"params": [], "grads": []}
-
-            param_grad_map[server_for_param]["params"].append(param)
-            param_grad_map[server_for_param]["grads"].append(grad)
-
-            pserver_idx += 1
-            if pserver_idx >= len(pserver_endpoints):
-                pserver_idx = 0
-    return param_grad_map
-
-
-class SimpleDistributeTranspiler:
-    def transpile(self,
-                  optimize_ops,
-                  params_grads,
-                  program=None,
-                  pservers="127.0.0.1:6174",
-                  trainers=1,
-                  split_method=round_robin):
-        """
-            Transpile the program to a distributed data-parallelism programs.
-
-            The main_program will be transform to use a remote parameter server
-            to do parameter optimization. And the optimization graph will be put
-            in to a parameter server program.
-
-            Use different methods to split trainable varialbles to different
-            parameter servers.
-
-            Example to run:
-
-            exe = fluid.Executor(place)
-            t = fluid.DistributeTranspiler()
-            t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
-
-            pserver_endpoint = os.getenv("PSERVER")
-            if pserver_endpoint:
-                pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
-                exe.run(fluid.default_startup_program())
-                exe.run(pserver_prog)
-            else:
-                feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-                exe.run(fluid.default_startup_program())
-
-                for pass_id in range(PASS_NUM):
-                    ...
-
-            :param optimize_ops: op list of optimization, should be the
-                                 return value of Optimizer.minimize
-            :type optimize_ops: list
-            :param program: program to optimize, default default_main_program
-            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
-            :type pservers: string
-
-            :return: return a list of programs
-        """
-        if program is None:
-            program = default_main_program()
-        self.program = program
-        self.trainers = trainers
-        self.optimize_ops = optimize_ops
-        self._optimize_distributed(
-            optimize_ops,
-            program,
-            params_grads,
-            pservers=pservers,
-            trainers=trainers,
-            split_method=split_method)
-
-    def _clone_param(self, block, v):
-        assert isinstance(v, Parameter)
-        new_p = Parameter(
-            block=block,
-            shape=v.shape,
-            dtype=v.dtype,
-            type=v.type,
-            lod_level=v.lod_level,
-            stop_gradient=v.stop_gradient,
-            trainable=v.trainable,
-            optimize_attr=v.optimize_attr,
-            regularizer=v.regularizer,
-            name=v.name)
-        block.vars[new_p.name] = new_p
-
-    def _clone_var(self, block, var):
-        assert isinstance(var, Variable)
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=var.persistable)
-
-    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
-                              **kwargs):
-        if kwargs.has_key("split_method"):
-            split_method = kwargs["split_method"]
-        else:
-            split_method = round_robin
-
-        assert (callable(split_method))
-        pserver_endpoints = kwargs["pservers"].split(",")
-        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
-
-        send_op_ordered_inputs = []
-        send_op_ordered_outputs = []
-        epmap = []
-        for ep, v in self.param_grad_map.iteritems():
-            send_op_ordered_inputs.extend(v["grads"])
-            send_op_ordered_outputs.extend(v["params"])
-            for i in v["grads"]:
-                epmap.append(ep)
-        send_op = program.global_block().append_op(
-            type="send",
-            inputs={"X": send_op_ordered_inputs
-                    },  # inputs is a list of tensors to be send
-            outputs={"Out": send_op_ordered_outputs},
-            attrs={"endpoints": pserver_endpoints,
-                   "epmap": epmap})
-
-    def get_trainer_program(self):
-        # remove optimize ops and add a send op to main_program
-        self.program.global_block().delete_ops(self.optimize_ops)
-        return self.program
-
-    def _create_var_for_trainers(self, block, var, trainers):
-        var_list = []
-        for i in xrange(trainers):
-            var_each = block.create_var(
-                name="%s.trainer_%d" % (var.name, i),
-                psersistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-            var_list.append(var_each)
-        return var_list
-
-    def get_pserver_program(self, endpoint, optimize_ops):
-        pserver_program = Program()
-        for v in self.param_grad_map[endpoint]["params"]:
-            self._clone_param(pserver_program.global_block(), v)
-
-        optimize_sub_program = Program()
-        grad_var_names = [
-            var.name for var in self.param_grad_map[endpoint]["grads"]
-        ]
-        for opt_op in optimize_ops:
-            for _, var in opt_op.inputs.iteritems():
-                # NOTE: append operators to merge gradients from multiple
-                # trainers. If trainers == 1, this is not needed.
-                if self.trainers > 1 and var.name in grad_var_names:
-                    vars2merge = self._create_var_for_trainers(
-                        optimize_sub_program.global_block(), var, self.trainers)
-                    merged_var = optimize_sub_program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
-                    optimize_sub_program.global_block().append_op(
-                        type="sum",
-                        inputs={"X": vars2merge},
-                        outputs={"Out": merged_var})
-                    optimize_sub_program.global_block().append_op(
-                        type="scale",
-                        inputs={"X": merged_var},
-                        outputs={"Out": merged_var},
-                        attrs={"scale": 1.0 / float(self.trainers)})
-                else:
-                    optimize_sub_program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
-
-            if opt_op.inputs.has_key("Grad"):
-                if opt_op.inputs["Grad"].name in grad_var_names:
-                    optimize_sub_program.global_block().append_op(
-                        type=opt_op.type,
-                        inputs=opt_op.inputs,
-                        outputs=opt_op.outputs,
-                        attrs=opt_op.attrs)
-            else:
-                optimize_sub_program.global_block().append_op(
-                    type=opt_op.type,
-                    inputs=opt_op.inputs,
-                    outputs=opt_op.outputs,
-                    attrs=opt_op.attrs)
-        pserver_program.global_block().append_op(
-            type="recv",
-            inputs={"RX":
-                    self.param_grad_map[endpoint]["grads"]},  # grads to recv
-            outputs={},
-            attrs={
-                "OptimizeBlock": optimize_sub_program.global_block(),
-                "endpoint": endpoint,
-                "ParamList":
-                [p.name for p in self.param_grad_map[endpoint]["params"]],
-                "GradList":
-                [p.name for p in self.param_grad_map[endpoint]["grads"]],
-                "Trainers": self.trainers
-            })
-        pserver_program.sync_with_cpp()
-        return pserver_program
diff --git a/python/paddle/fluid/transpiler/distributed_splitter.py b/python/paddle/fluid/transpiler/distributed_splitter.py
deleted file mode 100644
index 060c1df8ad..0000000000
--- a/python/paddle/fluid/transpiler/distributed_splitter.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def hash_name(varlist, pserver_endpoints):
-    """
-    hash variable names to several endpoints.
-
-    Args:
-        varlist(list): a list of Variables
-
-    Returns(dict): a map of pserver endpoint -> varname
-    """
-
-    def _hash_block(block_str, total):
-        return hash(block_str) % total
-
-    eplist = []
-    for var in varlist:
-        server_id = _hash_block(var.name(), len(pserver_endpoints))
-        server_for_param = pserver_endpoints[server_id]
-        eplist.append(server_for_param)
-    return eplist
-
-
-def round_robin(varlist, pserver_endpoints):
-    """
-    Distribute variables to several endpoints.
-    Args:
-        varlist(list): a list of variables
-        pserver_endpoints(list): a list of pserver endpoints
-
-    Returns(list[int]): the endpoint for each variable
-    """
-    assert (len(varlist) >= len(pserver_endpoints))
-
-    eplist = []
-    pserver_idx = 0
-    for var in varlist:
-        server_for_param = pserver_endpoints[pserver_idx]
-        eplist.append(server_for_param)
-
-        pserver_idx += 1
-        if pserver_idx >= len(pserver_endpoints):
-            pserver_idx = 0
-    return eplist
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 80a8f7c09c..9ff0ae6fca 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -107,7 +107,7 @@ class ControlFlowGraph(object):
         # Repeatedly apply liveness updates until the algorithm stablize
         # on a complete set live input vars and live output vars.
         while True:
-            for i in range(self.op_size, 0, -1):
+            for i in reversed(range(self.op_size)):
                 live_in[i] = set(self._live_in[i])
                 live_out[i] = set(self._live_out[i])
                 for s in self._successors[i]:
diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py
new file mode 100644
index 0000000000..d6a6867752
--- /dev/null
+++ b/python/paddle/fluid/transpiler/ps_dispatcher.py
@@ -0,0 +1,78 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class PSDispatcher(object):
+    """
+    PSDispatcher is the base class for dispatching vars
+    into different pserver instance.
+    You need to implement the `dispatch` inferface.
+    """
+
+    def __init__(self, pserver_endpoints):
+        self._eps = pserver_endpoints
+        self._step = 0
+
+    @property
+    def eps(self):
+        return self._eps
+
+    def reset(self):
+        self._step = 0
+
+    def dispatch(self, varlist):
+        """
+        :param varlist: a list of Variables
+        :return: a map of pserver endpoint -> varname 
+        """
+        AssertionError("Interface has not been implemented.")
+
+
+class HashName(PSDispatcher):
+    """
+      Hash variable names to several endpoints
+    """
+
+    def __init__(self, pserver_endpoints):
+        super(self.__class__, self).__init__(pserver_endpoints)
+
+    def _hash_block(self, block_str, total):
+        return hash(block_str) % total
+
+    def dispatch(self, varlist):
+        eplist = []
+        for var in varlist:
+            server_id = self._hash_block(var.name(), len(self._eps))
+            server_for_param = self._eps[server_id]
+            eplist.append(server_for_param)
+        return eplist
+
+
+class RoundRobin(PSDispatcher):
+    """
+    Distribute variables to serveral endpoints.
+    """
+
+    def __init__(self, pserver_endpoints):
+        super(self.__class__, self).__init__(pserver_endpoints)
+
+    def dispatch(self, varlist):
+        eplist = []
+        for var in varlist:
+            server_for_param = self._eps[self._step]
+            eplist.append(server_for_param)
+            self._step += 1
+            if self._step >= len(self._eps):
+                self._step = 0
+        return eplist
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 7bdddeaabe..357a4e9b00 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -119,7 +119,8 @@ def reader_creator(data_file,
                 yield sample, int(label) - 1
 
     if use_xmap:
-        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
+        return xmap_readers(mapper, reader, cpu_num, buffered_size)
     else:
         return map_readers(mapper, reader)
 
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
index 317cf037c6..d48c54fcbb 100644
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size):
+def batch(reader, batch_size, drop_last=False):
     """
     Create a batched reader.
 
@@ -23,6 +23,8 @@ def batch(reader, batch_size):
     :type reader: callable
     :param batch_size: size of each mini-batch
     :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
     :return: the batched reader.
     :rtype: callable
     """
@@ -35,7 +37,7 @@ def batch(reader, batch_size):
             if len(b) == batch_size:
                 yield b
                 b = []
-        if b:
+        if drop_last == False and len(b) != 0:
             yield b
 
     return batch_reader
diff --git a/python/setup.py.in b/python/setup.py.in
index c42601d335..8257f1d5e2 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -69,7 +69,8 @@ packages=['paddle',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
-          'paddle.fluid.transpiler']
+          'paddle.fluid.transpiler',
+          'paddle.fluid.transpiler.details']
 
 if '${WITH_FLUID_ONLY}'== 'OFF':
     packages+=['paddle.proto',
diff --git a/tools/codestyle/.gitignore b/tools/codestyle/.gitignore
new file mode 100644
index 0000000000..0d20b6487c
--- /dev/null
+++ b/tools/codestyle/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index 94d1e23ce7..b194af76dc 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,8 +4,12 @@ TOTAL_ERRORS=0
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
-    cpplint $file;
-    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+    if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/cuda/.*|paddle/function/.*|paddle/gserver/.*|paddle/math/.*|paddle/optimizer/.*|paddle/parameter/.*|paddle/pserver/.*|paddle/trainer/.*|paddle/utils/.*) ]]; then
+        continue;
+    else
+        cpplint $file;
+        TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+    fi
 done
 
 exit $TOTAL_ERRORS
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
new file mode 100644
index 0000000000..54a6904626
--- /dev/null
+++ b/tools/codestyle/docstring_checker.py
@@ -0,0 +1,345 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DocstringChecker is used to check python doc string's style."""
+
+import six
+import astroid
+
+from pylint.checkers import BaseChecker, utils
+from pylint.interfaces import IAstroidChecker
+
+from collections import defaultdict
+import re
+
+
+def register(linter):
+    """Register checkers."""
+    linter.register_checker(DocstringChecker(linter))
+
+
+class Docstring(object):
+    """Docstring class holds the parsed doc string elements.
+    """
+
+    def __init__(self):
+        self.d = defaultdict(list)  #name->[]
+        self.clear()
+
+    def clear(self):
+        self.d['Args'] = []
+        self.d['Examples'] = []
+        self.d['Returns'] = []
+        self.d['Raises'] = []
+        self.args = {}  #arg_name->arg_type
+
+    def get_level(self, string, indent='    '):
+        level = 0
+        unit_size = len(indent)
+        while string[:unit_size] == indent:
+            string = string[unit_size:]
+            level += 1
+
+        return level
+
+    def parse(self, doc):
+        """parse gets sections from doc
+        Such as Args, Returns, Raises, Examples s
+        Args:
+            doc (string): is the astroid node doc string.
+        Returns:
+            True if doc is parsed successfully.
+        """
+        self.clear()
+
+        lines = doc.splitlines()
+        state = ("others", -1)
+        for l in lines:
+            c = l.strip()
+            if len(c) <= 0:
+                continue
+
+            level = self.get_level(l)
+            if c.startswith("Args:"):
+                state = ("Args", level)
+            elif c.startswith("Returns:"):
+                state = ("Returns", level)
+            elif c.startswith("Raises:"):
+                state = ("Raises", level)
+            elif c.startswith("Examples:"):
+                state = ("Examples", level)
+            else:
+                if level > state[1]:
+                    self.d[state[0]].append(c)
+                    continue
+
+                state = ("others", -1)
+                self.d[state[0]].append(c)
+
+        self._arg_with_type()
+        return True
+
+    def get_returns(self):
+        return self.d['Returns']
+
+    def get_raises(self):
+        return self.d['Raises']
+
+    def get_examples(self):
+        return self.d['Examples']
+
+    def _arg_with_type(self):
+
+        for t in self.d['Args']:
+            m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
+            if m:
+                self.args[m.group(1)] = m.group(2)
+
+        return self.args
+
+
+class DocstringChecker(BaseChecker):
+    """DosstringChecker is pylint checker to
+    check docstring style.
+    """
+    __implements__ = (IAstroidChecker, )
+
+    POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument'
+    KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument'
+
+    name = 'doc-string-checker'
+    symbol = "doc-string"
+    priority = -1
+    msgs = {
+        'W9001': ('One line doc string on > 1 lines', symbol + "-one-line",
+                  'Used when a short doc string is on multiple lines'),
+        'W9002':
+        ('Doc string does not end with "." period', symbol + "-end-with",
+         'Used when a doc string does not end with a period'),
+        'W9003':
+        ('All args with their types must be mentioned in doc string %s',
+         symbol + "-with-all-args",
+         'Used when not all arguments are in the doc string '),
+        'W9005': ('Missing docstring or docstring is too short',
+                  symbol + "-missing", 'Add docstring longer >=10'),
+        'W9006': ('Docstring indent error, use 4 space for indent',
+                  symbol + "-indent-error", 'Use 4 space for indent'),
+        'W9007': ('You should add `Returns` in comments',
+                  symbol + "-with-returns",
+                  'There should be a `Returns` section in comments'),
+        'W9008': ('You should add `Raises` section in comments',
+                  symbol + "-with-raises",
+                  'There should be a `Raises` section in comments'),
+    }
+    options = ()
+
+    def visit_functiondef(self, node):
+        """visit_functiondef checks Function node docstring style.
+        Args:
+            node (astroid.node): The visiting node.
+        Returns:
+            True if successful other wise False.
+        """
+
+        self.check_doc_string(node)
+
+        if node.tolineno - node.fromlineno <= 10:
+            return True
+
+        if not node.doc:
+            return True
+
+        doc = Docstring()
+        doc.parse(node.doc)
+
+        self.all_args_in_doc(node, doc)
+        self.with_returns(node, doc)
+        self.with_raises(node, doc)
+
+    def visit_module(self, node):
+        self.check_doc_string(node)
+
+    def visit_classdef(self, node):
+        self.check_doc_string(node)
+
+    def check_doc_string(self, node):
+        self.missing_doc_string(node)
+        self.one_line(node)
+        self.has_period(node)
+        self.indent_style(node)
+
+    def missing_doc_string(self, node):
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
+        if node.tolineno - node.fromlineno <= 10:
+            return True
+
+        if node.doc is None or len(node.doc) < 10:
+            self.add_message('W9005', node=node, line=node.fromlineno)
+        return False
+
+    # FIXME(gongwb): give the docstring line-no
+    def indent_style(self, node, indent=4):
+        """indent_style checks docstring's indent style
+        Args:
+            node (astroid.node): The visiting node.
+            indent (int): The default indent of style
+        Returns:
+            True if successful other wise False.
+        """
+        if node.doc is None:
+            return True
+
+        doc = node.doc
+        lines = doc.splitlines()
+        line_num = 0
+
+        for l in lines:
+            if line_num == 0:
+                continue
+            cur_indent = len(l) - len(l.lstrip())
+            if cur_indent % indent != 0:
+                self.add_message('W9006', node=node, line=node.fromlineno)
+                return False
+            line_num += 1
+
+        return True
+
+    def one_line(self, node):
+        """one_line checks if docstring (len < 40) is on one line.
+        Args:
+            node (astroid.node): The node visiting.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        doc = node.doc
+        if doc is None:
+            return True
+
+        if len(doc) > 40:
+            return True
+        elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3:
+            return True
+        else:
+            self.add_message('W9001', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def has_period(self, node):
+        """has_period checks if one line doc end-with '.' .
+        Args:
+            node (astroid.node): the node is visiting.
+        Returns:
+            True if successful otherwise False.
+        """
+        if node.doc is None:
+            return True
+
+        if len(node.doc.splitlines()) > 1:
+            return True
+
+        if not node.doc.strip().endswith('.'):
+            self.add_message('W9002', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def with_raises(self, node, doc):
+        """with_raises checks if one line doc end-with '.' .
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        find = False
+        for t in node.body:
+            if not isinstance(t, astroid.Raise):
+                continue
+
+            find = True
+            break
+
+        if not find:
+            return True
+
+        if len(doc.get_raises()) == 0:
+            self.add_message('W9008', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def with_returns(self, node, doc):
+        """with_returns checks if docstring comments what are returned .
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        find = False
+        for t in node.body:
+            if not isinstance(t, astroid.Return):
+                continue
+
+            find = True
+            break
+
+        if not find:
+            return True
+
+        if len(doc.get_returns()) == 0:
+            self.add_message('W9007', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def all_args_in_doc(self, node, doc):
+        """all_args_in_doc checks if arguments are mentioned in doc
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object
+        Returns:
+            True if successful otherwise False.
+        """
+        args = []
+        for arg in node.args.get_children():
+            if (not isinstance(arg, astroid.AssignName)) \
+                or arg.name == "self":
+                continue
+            args.append(arg.name)
+
+        if len(args) <= 0:
+            return True
+
+        parsed_args = doc.args
+        args_not_documented = set(args) - set(parsed_args)
+        if len(args) > 0 and len(parsed_args) <= 0:
+            self.add_message(
+                'W9003',
+                node=node,
+                line=node.fromlineno,
+                args=list(args_not_documented))
+            return False
+
+        for t in args:
+            if t not in parsed_args:
+                self.add_message(
+                    'W9003', node=node, line=node.fromlineno, args=[t, ])
+                return False
+
+        return True
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
new file mode 100755
index 0000000000..150a3f5666
--- /dev/null
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+TOTAL_ERRORS=0
+
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+export PYTHONPATH=$DIR:$PYTHONPATH
+
+# The trick to remove deleted files: https://stackoverflow.com/a/2413151
+for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
+    pylint --disable=all --load-plugins=docstring_checker \
+    --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
+    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+done
+
+exit $TOTAL_ERRORS
+#For now, just warning:
+#exit 0
+
diff --git a/tools/codestyle/test_docstring_checker.py b/tools/codestyle/test_docstring_checker.py
new file mode 100644
index 0000000000..0547f7d161
--- /dev/null
+++ b/tools/codestyle/test_docstring_checker.py
@@ -0,0 +1,232 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import docstring_checker
+import pylint.testutils
+import astroid
+import pytest
+import sys
+
+
+class TestDocstring(pylint.testutils.CheckerTestCase):
+    CHECKER_CLASS = docstring_checker.DocstringChecker
+
+    def test_one_line(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get 
+            news.
+            """
+            if True:
+                return 5
+            return 5
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9001' == got[0][0]
+
+    def test_one_line(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news"""
+            if True:
+                return 5
+            return 5
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9002' == got[0][0]
+
+    def test_args(self):
+        func_node = astroid.extract_node('''
+        def test(scale, mean): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9003' == got[0][0]
+
+    def test_missing(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9005' == got[0][0]
+
+    def test_indent(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """ get get get get get get get get
+              get get get get get get get get.
+            """
+            pass 
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9006' == got[0][0]
+
+    def test_with_resturns(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            return mean
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9007' == got[0][0]
+
+    def test_with_raises(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            raise ValueError('A very specific bad thing happened.')
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9008' == got[0][0]
+
+    def test_no_message(self):
+        p = '''
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None):
+    """
+    **Fully Connected Layer**
+    The fully connected layer can take multiple tensors as its inputs. It
+    creates a variable called weights for each input tensor, which represents
+    a fully connected weight matrix from each input unit to each output unit.
+    The fully connected layer multiplies each input tensor with its coresponding
+    weight to produce an output Tensor. If multiple input tensors are given,
+    the results of multiple multiplications will be sumed up. If bias_attr is
+    not None, a bias variable will be created and added to the output. Finally,
+    if activation is not None, it will be applied to the output as well.
+    This process can be formulated as follows:
+
+    Args:
+        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+            the input tensor(s) is at least 2.
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multidimensional tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flatten to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to None, no bias will be added to the output units.
+        act (str, default None): Activation to be applied to the output of this layer.
+        name (str, default None): The name of this layer.
+    Returns:
+        A tensor variable storing the transformation result.
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+    Examples:
+        .. code-block:: python
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+    """
+    raise ValueError('A very specific bad thing happened.')
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    return size
+    '''
+
+        func_node = astroid.extract_node(p)
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 0