diff --git a/.clang_format.hook b/.clang_format.hook
new file mode 100755
index 0000000000..1d92821686
--- /dev/null
+++ b/.clang_format.hook
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+# Pre-commit wrapper: run clang-format only when the installed version matches.
+readonly VERSION="3.8"
+# Because of `set -e`, a missing clang-format binary aborts the hook right here.
+version=$(clang-format -version)
+# Substring match against the full version banner printed by clang-format.
+if ! [[ $version == *"$VERSION"* ]]; then
+    echo "clang-format version check failed." >&2
+    echo "a version containing '$VERSION' is needed, but got '$version'" >&2
+    echo "you can install the right version, and make a soft-link to it in '\$PATH'" >&2
+    exit 1
+fi
+# Quote "$@" so file names with spaces or glob characters are passed intact.
+clang-format "$@"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bb8c88787d..83fe9af768 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,10 +19,10 @@
     -   id: end-of-file-fixer
 -   repo: local
     hooks:
-    -   id: clang-format
+    -   id: clang-format-with-version-check
         name: clang-format
         description: Format files with ClangFormat.
-        entry: clang-format -i
+        entry: bash ./.clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcd1218a5b..ad559672ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
+option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -137,9 +138,9 @@ set(EXTERNAL_LIBS
 )
 
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
     if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
     endif(NOT WITH_DSO)
 endif(WITH_GPU)
 
diff --git a/Dockerfile b/Dockerfile
index da00471025..136db772cc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_STYLE_CHECK
 
 ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 ENV HOME /root
 # Add bash enhancements
@@ -71,20 +69,6 @@ RUN pip install -r /root/requirements.txt
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip install certifi urllib3[secure]
 
-# TODO(qijun) The template library Eigen doesn't work well with GCC 5 
-# coming with the default Docker image, so we switch to use GCC 4.8 
-# by default. And I will check Eigen library later.
-
-RUN ln -sf gcc-4.8 /usr/bin/gcc && \
-    ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \
-    ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \
-    ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \
-    ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \
-    ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \
-    ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \
-    ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \
-    ln -sf g++-4.8 /usr/bin/g++ && \
-    ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++ 
 
 # Install woboq_codebrowser to /woboq
 RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 209f9078a6..51c3b918cc 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
     add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
 
+if(USE_EIGEN_FOR_BLAS)
+    add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
+endif(USE_EIGEN_FOR_BLAS)
+
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 69f40df516..2c84061ff5 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
     return()
 endif()
 
-set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
+set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
 find_path(CUDNN_INCLUDE_DIR cudnn.h
     PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
     $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index b27eb71550..ff246b2eb4 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,13 +9,6 @@ function(CheckCompilerCXX11Flag)
         if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
             message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
         endif()
-        if(NOT ANDROID)
-            # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
-            # Use Debug mode instead for now.
-            if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
-                set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
-            endif()
-        endif()
     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
         # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
@@ -160,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
 LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
 
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index cb330ea5e1..2273c8e869 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -257,6 +257,11 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
     :noindex:
 
+seq_slice
+---------
+..  autoclass:: paddle.v2.layer.seq_slice
+    :noindex:
+
 kmax_sequence_score
 -------------------
 ..  autoclass:: paddle.v2.layer.kmax_sequence_score
@@ -362,6 +367,11 @@ trans
 ..  autoclass:: paddle.v2.layer.trans
     :noindex:
 
+scale_shift
+-----------
+..  autoclass:: paddle.v2.layer.scale_shift
+    :noindex:
+
 Sampling Layers
 ===============
 
diff --git a/doc/design/cluster_train/README.md b/doc/design/cluster_train/README.md
index 74961f8005..177a5f5d54 100644
--- a/doc/design/cluster_train/README.md
+++ b/doc/design/cluster_train/README.md
@@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below:
 <img src="src/paddle-task-states.png"/>
 
 1. When a new pass of training starts, all tasks will be placed in the todo queue.
-1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion.
-1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer.
-1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
+1. When a trainer requests a new task, the master server will dispatch a task from the todo queue to it, put the task in the pending queue and wait for completion.
+1. The trainer will work on its task, tell the master server once the task is completed, and ask for a new task. The master server will then dispatch a new task to that trainer.
+1. If a task fails for any reason in a trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
 1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero.
 
 ### Trainer Process
 
 The trainer process will:
 
-- Receive tasks from the master.
-- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
+- Request tasks from the master.
+- Work on the tasks.
+- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
 
 ### Parameter Server Process
 
@@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at
 
 1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
 1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
-1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
-1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
+1. Writes its IP address to */master/addr* so that trainers can discover it.
+1. Listens for task requests from trainers, dispatches one task upon each request, and updates the task queue using an etcd transaction to ensure the lock is held during the update.
 
 When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
 
@@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i
 
 When the trainer is started by the Kubernetes, it executes the following steps at startup:
 
-1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
-1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
-1. Waits for tasks from the master to start training.
+1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
+1. Finds and watches */master/addr* to get master's address.
+1. Requests tasks from the master to start training.
 
-If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master server can discover the trainer again.
-
-When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
+When a trainer fails, Kubernetes would try to restart it. The recovered trainer would fetch tasks from the master and go on training.
 
 ### Parameter Server Process
 
diff --git a/doc/design/cluster_train/src/paddle-etcd.graffle b/doc/design/cluster_train/src/paddle-etcd.graffle
index 56681ae5bb..f973dc9b9d 100644
Binary files a/doc/design/cluster_train/src/paddle-etcd.graffle and b/doc/design/cluster_train/src/paddle-etcd.graffle differ
diff --git a/doc/design/cluster_train/src/paddle-etcd.png b/doc/design/cluster_train/src/paddle-etcd.png
index 4f9c9762b3..57981ceb4b 100644
Binary files a/doc/design/cluster_train/src/paddle-etcd.png and b/doc/design/cluster_train/src/paddle-etcd.png differ
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index e956994431..fe8da907d9 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -101,6 +101,7 @@ if use_mkldnn
 5. 在**Argument**里添加两个`MkldnnMatrixPtr`，取名为`mkldnnValue`和`mkldnnGrad`，用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名)，用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
 6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑，用于判断`deviceId`，并针对device在MKL-DNN和CPU之间不统一的情况，做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
 7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
+8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况，所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面，一直保存的是0，所以可以充分利用这个信息，定义一个枚举处理所有MKLDNN的参数格式，从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。
 
 ## References
 
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index c0608ede8e..2f14614894 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -68,7 +68,7 @@ As a simple example, consider the following:
 
 1. **BLAS Dependencies(optional)**
   
-    CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
+    CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
     To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
@@ -131,9 +131,9 @@ As a simple example, consider the following:
     To build GPU version, you will need the following installed:
 
         1. a CUDA-capable GPU
-        2. A supported version of Linux with a gcc compiler and toolchain
+        2. A supported version of Linux with a GCC compiler and toolchain
         3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
 
     The CUDA development environment relies on tight integration with the host development environment,
     including the host compiler and C runtime libraries, and is therefore only supported on
@@ -172,6 +172,7 @@ export PATH=<path to install>/bin:$PATH
 # install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
+
 ## <span id="centos">Build on Centos 7</span>
 
 ### Install Dependencies
@@ -192,9 +193,9 @@ sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
     To build GPU version, you will need the following installed:
 
         1. a CUDA-capable GPU
-        2. A supported version of Linux with a gcc compiler and toolchain
+        2. A supported version of Linux with a GCC compiler and toolchain
         3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
 
     The CUDA development environment relies on tight integration with the host development environment,
     including the host compiler and C runtime libraries, and is therefore only supported on
@@ -222,7 +223,7 @@ mkdir build && cd build
 ``` 
 
 Finally, you can build and install PaddlePaddle:
-
+  
 ```bash
 # you can add build option here, such as:    
 cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md
new file mode 100644
index 0000000000..0b911f7b75
--- /dev/null
+++ b/doc/howto/dev/build_cn.md
@@ -0,0 +1,124 @@
+# 编译PaddlePaddle和运行单元测试
+
+## 需要的软硬件
+
+为了开发PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统，以及
+1. Docker。
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker image 里。
+
+## 总体流程
+
+1. 获取源码
+
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+
+2. 安装开发工具到 Docker image 里
+
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+
+   请注意这个命令结尾处的 `.`；它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)，按照其内容创建一个名为 `paddle:dev` 的 Docker image，并且把各种开发工具安装进去。
+
+3. 编译
+
+   以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image，同时把当前目录（源码树根目录）映射为 container 里的 `/paddle` 目录，并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码，结果输出到 `/paddle/build`，也就是本地的源码树根目录里的 `build` 子目录。
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev
+   ```
+
+   上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本，可以用
+
+   ```bash
+   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+
+4. 运行单元测试
+
+   用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试：
+
+   ```bash
+   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   如果编译的时候我们用了 `WITH_GPU=OFF` 选项，那么编译过程只会产生 CPU-based 单元测试，那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要：
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   有时候我们只想运行一个特定的单元测试，比如 `memory_test`，我们可以
+
+   ```bash
+   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+
+5. 清理
+
+   有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要：
+
+   ```bash
+   rm -rf build
+   ```
+
+## 为什么要 Docker 呀？
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+## 可能碰到的问题
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的容器技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+
+- 磁盘不够
+
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md
new file mode 100644
index 0000000000..d0048e3714
--- /dev/null
+++ b/doc/howto/dev/build_en.md
@@ -0,0 +1,124 @@
+# Build PaddlePaddle from Source Code and Run Unit Test
+
+## What Developers Need
+
+To contribute to PaddlePaddle, you need
+
+1. A computer -- Linux, BSD, Windows, MacOS, and
+1. Docker.
+
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.  We run all the tools by running this image.
+
+## General Process
+
+1. Retrieve source code.
+
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+
+2. Install build tools into a Docker image.
+
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+
+   Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).  `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
+
+3. Build from source.
+
+   The following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockerfile.  `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev
+   ```
+
+   The above command builds a CUDA-enabled version.  If we want to build a CPU-only version, we can type
+
+   ```bash
+   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+
+4. Run unit tests.
+
+   To run all unit tests using the first GPU of a node:
+
+   ```bash
+   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them.  We can just run
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   If we only want to run a specific unit test, say `memory_test`, we can run
+
+   ```bash
+   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+
+5. Clean Build.
+
+   Sometimes, we might want to clean all third-party dependencies and built binaries.  To do so, just
+
+   ```bash
+   rm -rf build
+   ```
+
+## Docker, Or Not?
+
+- What is Docker?
+
+  If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware or run a guest OS, which means there is no compromise on the performance.
+
+- Why Docker?
+
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+
+- Can I choose not to use Docker?
+
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer.  This document exists because Docker would make the development way easier.
+
+- How difficult is it to learn Docker?
+
+  It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started), and doing so saves you more than one hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require some new tools.  Not to mention the time saved when other people try to reproduce the issue you have.
+
+- Can I use my favorite IDE?
+
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
+
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
+
+## Some Gotchas
+
+- Docker requires sudo
+
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs to make the build efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+
+- Not enough disk space
+
+  Examples in this article use the `--rm` option with the `docker run` command.  This option ensures that stopped containers are not left on the hard disk.  We can use `docker ps -a` to list all containers, including stopped ones.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 26449a6365..0608aa3096 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,6 +19,7 @@
 ..  toctree::
   :maxdepth: 1
 
+  dev/build_cn.md
   dev/write_docs_cn.rst
   dev/contribute_to_paddle_cn.md
 
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1fbfcd260b..1b6034be4e 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,6 +18,7 @@ Development
 ..  toctree::
   :maxdepth: 1
 
+  dev/build_en.md
   dev/new_layer_en.rst
   dev/contribute_to_paddle_en.md
 
diff --git a/go/master/client.go b/go/master/client.go
index 62801b9b7f..f04cf50ce3 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error {
 // WithEtcd sets the client to use etcd for master discovery.
 func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
 	return func(c *Client) error {
-		cli, err := clientv3.New(clientv3.Config{
-			Endpoints:   endpoints,
-			DialTimeout: timeout,
-		})
-		if err != nil {
+		var cli *clientv3.Client
+		f := func() error {
+			var err error
+			cli, err = clientv3.New(clientv3.Config{
+				Endpoints:   endpoints,
+				DialTimeout: timeout,
+			})
 			return err
 		}
+		for {
+			err := f()
+			if err != nil {
+				log.Warningln(err)
+			} else {
+				break
+			}
+			time.Sleep(time.Second)
+		}
 
 		ch := make(chan string, 1)
 		a, err := GetKey(cli, DefaultAddrPath, timeout)
@@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) {
 		}
 	}
 	c.ch = make(chan record, c.bufSize)
-	// FIXME: connection is created asyncrosly in monitorMaster go routine,
-	//        ensure the connection is ready for use before calling c.addClient.
-	time.Sleep(time.Second)
 	return c, nil
 }
 
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index cf61a243e9..ec866b2907 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -15,6 +15,7 @@ if(Boost_FOUND)
   add_subdirectory(platform)
   add_subdirectory(framework)
   add_subdirectory(operators)
+  add_subdirectory(pybind)
 endif()
 
 if(WITH_C_API)
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index 11022d1754..dde99ab340 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -53,7 +53,10 @@ add_custom_target(paddle_capi_whole ALL
 set_target_properties(paddle_capi_whole
   PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
 
+set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
+# TODO: merge mkl into paddle_capi_shared
 add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
 target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 link_paddle_exe(paddle_capi_shared)
 
diff --git a/paddle/capi/export.map b/paddle/capi/export.map
new file mode 100644
index 0000000000..8d673f675d
--- /dev/null
+++ b/paddle/capi/export.map
@@ -0,0 +1,6 @@
+{
+	global:
+		paddle_*;
+	local:
+		*;
+};
diff --git a/paddle/capi/export.sym b/paddle/capi/export.sym
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index b3287552db..629449bbd4 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -146,3 +146,19 @@ paddle_error paddle_gradient_machine_randomize_param(
   m->machine->randParameters();
   return kPD_NO_ERROR;
 }
+
+paddle_error paddle_gradient_machine_get_layer_output(
+    paddle_gradient_machine machine,
+    const char* layerName,
+    paddle_arguments args) {
+  auto m = cast(machine);
+  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
+  if (m == nullptr || layerName == nullptr || out == nullptr ||
+      m->machine == nullptr) {
+    return kPD_NULLPTR;  // reject null handles or a null layer name
+  }
+  // The layer output is appended to `args`; existing entries are kept.
+  auto layerOutput = m->machine->getLayerOutput(layerName);
+  out->args.push_back(layerOutput);
+  return kPD_NO_ERROR;
+}
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
index c613ade5b2..28eeb23e3b 100644
--- a/paddle/capi/gradient_machine.h
+++ b/paddle/capi/gradient_machine.h
@@ -39,7 +39,11 @@ PD_API paddle_error paddle_gradient_machine_create_for_inference(
 /**
  * @brief Create a gradient machine used for model inference, using config with
  *        parameters which is generated by `paddle merge_model`.
- * @param [out] machine that used for model inference.
+ *        Example:
+ *          paddle merge_model \
+ *                 --model_dir="pass-00000" \
+ *                 --model_file="merged_model.paddle"
+ * @param [out] machine that used for model inference
  * @param [in] mergedModel
  * @param [in] size
  * @return paddle_error
@@ -97,6 +101,18 @@ paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
 PD_API paddle_error
 paddle_gradient_machine_destroy(paddle_gradient_machine machine);
 
+/**
+ * @brief Get the output of the layer named `layerName`.
+ * @param [in] machine gradient machine that has run an inference
+ * @param [in] layerName name of the specified layer
+ * @param [out] args argument list the layer's output is appended to
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
+                                         const char* layerName,
+                                         paddle_arguments args);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index db18e4912b..3f68c62de6 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input,
                               int* convBwdDataAlgo,
                               size_t* bwdDataLimitBytes,
                               int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes);
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation);
 
 /**
  * @brief   destroy filter descriptor.
@@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                              int padding_height,
                                              int padding_width,
                                              int stride_height,
-                                             int stride_width);
+                                             int stride_width,
+                                             int dilation_h = 1,
+                                             int dilation_w = 1);
 
 /**
  * @brief   reset convolution descriptor.
@@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                            int stride_width);
+                                            int stride_width,
+                                            int dilation_h = 1,
+                                            int dilation_w = 1);
 
 /**
  * @brief   destroy convolution descriptor.
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index abd0d6b099..3afcc6fa85 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                              int padding_height,
                                              int padding_width,
                                              int stride_height,
-                                             int stride_width) {}
+                                             int stride_width,
+                                             int dilation_h,
+                                             int dilation_w) {}
 
 inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             hl_tensor_descriptor image,
@@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                            int stride_width) {}
+                                            int stride_width,
+                                            int dilation_h,
+                                            int dilation_w) {}
 
 inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
 
@@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input,
                               int* convBwdDataAlgo,
                               size_t* bwdDataLimitBytes,
                               int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes) {}
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation) {}
 
 inline void hl_convolution_forward(hl_tensor_descriptor input,
                                    real* input_data,
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 78642a1744..f38ef69255 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input,
                        int* convBwdDataAlgo,
                        size_t* bwdDataLimitBytes,
                        int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes) {
+                       size_t* bwdFilterLimitBytes,
+                       bool useDilation) {
 #if CUDNN_VERSION >= 4000
 
   CHECK_NOTNULL(input);
@@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input,
   size_t memoryLimitBytes =
       (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
 
+  // For dilation
+  int algo = 0;
+
   // cudnn convolution forward configuration
   cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
   cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
   cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
   cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward data configuration
+  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward filter configuration
+  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
 
-  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-      t_resource.cudnn_handle,
-      fwd_src_desc,
-      fwd_filter_desc,
-      fwd_conv_desc,
-      fwd_dest_desc,
-      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+  if (useDilation) {  // dilated conv: force algorithm 0 for all three passes
+    *convFwdAlgo = algo;  // write through the out-pointer for the caller
+    *convBwdDataAlgo = algo;
+    *convBwdFilterAlgo = algo;
+  } else {
+    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+        t_resource.cudnn_handle,
+        fwd_src_desc,
+        fwd_filter_desc,
+        fwd_conv_desc,
+        fwd_dest_desc,
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_data_filter_desc,
+        bwd_data_diff_desc,
+        bwd_data_conv_desc,
+        bwd_data_grad_desc,
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_filter_src_desc,
+        bwd_filter_diff_desc,
+        bwd_filter_conv_desc,
+        bwd_filter_grad_desc,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+  }
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
       t_resource.cudnn_handle,
@@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
       fwdLimitBytes));
 
-  // cudnn convolution backward data configuration
-  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_data_filter_desc,
-      bwd_data_diff_desc,
-      bwd_data_conv_desc,
-      bwd_data_grad_desc,
-      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
       t_resource.cudnn_handle,
       bwd_data_filter_desc,
@@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
       bwdDataLimitBytes));
 
-  // cudnn convolution backward filter configuration
-  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_filter_src_desc,
-      bwd_filter_diff_desc,
-      bwd_filter_conv_desc,
-      bwd_filter_grad_desc,
-      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
       t_resource.cudnn_handle,
       bwd_filter_src_desc,
@@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                       int padding_height,
                                       int padding_width,
                                       int stride_height,
-                                      int stride_width) {
+                                      int stride_width,
+                                      int dilation_h,
+                                      int dilation_w) {
   CHECK_NOTNULL(conv);
 
   cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
@@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode,
                                                        data_type));
 #else
+  if (dilation_h > 1 || dilation_w > 1) {
+    LOG(FATAL)
+        << "Current cuDNN version does't support for dilation convolution. "
+        << "The dilation convolution requires cuDNN >= v6.0.";
+  }
+
   CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
                                                        padding_height,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode));
 #endif
 
@@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                      int padding_height,
                                      int padding_width,
                                      int stride_height,
-                                     int stride_width) {
+                                     int stride_width,
+                                     int dilation_h,
+                                     int dilation_w) {
   CHECK_NOTNULL(conv);
   CHECK_NOTNULL(image);
   CHECK_NOTNULL(filter);
@@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode,
                                                        data_type));
 #else
@@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode));
 #endif
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 68304c9fc8..c0838d9b75 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
-
-cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
@@ -39,21 +39,3 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
-
-if(WITH_PYTHON)
-cc_library(paddle_pybind SHARED
-    SRCS pybind.cc
-    DEPS pybind python backward
-    sgd_op
-    add_op
-    mul_op
-    rowwise_add_op
-    sigmoid_op
-    softmax_op
-    mean_op
-    cross_entropy_op
-    recurrent_op
-    uniform_random_op
-    gaussian_random_op
-    fill_zeros_like_op)
-endif(WITH_PYTHON)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index c226e4e3d2..bfda18724c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -15,6 +15,8 @@
 #include "paddle/framework/backward.h"
 
 #include <list>
+#include <memory>
+
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@@ -43,11 +45,11 @@ static bool AllInSet(
   return all_in_set;
 }
 
-static std::shared_ptr<OperatorBase> NOP() {
-  auto net_op = std::make_shared<operators::NetOp>();
+static std::unique_ptr<OperatorBase> NOP() {
+  auto net_op = std::unique_ptr<operators::NetOp>(new operators::NetOp());
   net_op->SetType("@NOP@");
   net_op->CompleteAddOp();
-  return net_op;
+  return std::unique_ptr<OperatorBase>(net_op.release());  // transfer ownership
 }
 
 //  Get backward operator from a forward operator, a recursive implementation.
@@ -62,11 +64,7 @@ static std::shared_ptr<OperatorBase> NOP() {
 //  operator, in a complex situation, it maybe a NetOp.
 //
 //  See Backward.h for details
-static std::shared_ptr<OperatorBase> BackwardRecursive(
-    const OperatorBase& forwardOp,
-    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
-
-std::shared_ptr<OperatorBase> BackwardRecursive(
+static std::unique_ptr<OperatorBase> BackwardRecursive(
     const OperatorBase& forwardOp,
     std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
   //  If all input gradients of forwarding operator do not need to calculate,
@@ -91,7 +89,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
   }
 
   // Returned gradient network
-  auto net = std::make_shared<operators::NetOp>();
+  auto net = std::unique_ptr<operators::NetOp>(new operators::NetOp());
 
   if (forwardOp.IsNetOp()) {
     // Because forwardOp is a net op, it can static_cast.
@@ -105,14 +103,14 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
     // reversely travel forwardNet and collect all duplicate outputs.
     for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
          ++it, ++local_op_id) {
-      auto fwd = *it;
+      auto& fwd = *it;
       auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
-      net->AddOp(bwd);
       ForEachVarName(bwd->Outputs(),
                      [&dup_output_ops, local_op_id](const std::string& out) {
                        dup_output_ops[out].emplace_back(local_op_id);
                        return false;
                      });
+      net->AppendOp(std::move(bwd));
     }
     // Get unique ID for this method.
     auto uid = uniq_id++;
@@ -122,7 +120,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
     // to handle this case. For each duplicate output, rename it to an alias
     // (original name with a offset), append an `add` op for its operator,
     // and finally sum all the alias variable to the final output variable y.
-    using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
+    using Pos = std::pair<size_t, std::unique_ptr<OperatorBase>>;
     std::list<Pos> insert_position;
     for (auto& dup_output_op : dup_output_ops) {
       const std::string& name = dup_output_op.first;
@@ -150,13 +148,13 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
         [](const Pos& l, const Pos& r) { return l.first > r.first; });
 
     for (auto& pos : insert_position) {
-      net->InsertOp(pos.first + 1, pos.second);
+      net->InsertOp(pos.first + 1, std::move(pos.second));
     }
   } else {
-    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
+    std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
 
-    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net,
-                                       grad_op](const std::string& grad_input) {
+    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
+                                          const std::string& grad_input) {
       if (no_grad_names.count(grad_input)) {
         // +1 for \0
         std::string prefix = grad_input.substr(
@@ -165,8 +163,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
 
         // If part of input gradient of that operator is not calculated, fill
         // zero variables to that input gradient.
-        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {{"Src", {prefix}}},
-                                        {{"Dst", {grad_input}}}, {}));
+        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
+                                           {{"Src", {prefix}}},
+                                           {{"Dst", {grad_input}}}, {}));
       }
       return false;
     });
@@ -190,23 +189,23 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
       const auto& stepnet_op =
           *static_cast<const OperatorBase*>(&rnnop.stepnet());
       // create stepnet's gradient op
-      auto grad_stepnet = BackwardRecursive(stepnet_op, no_grad_names, uniq_id);
       rnn_grad_op->set_stepnet(
-          std::static_pointer_cast<operators::NetOp>(grad_stepnet));
+          BackwardRecursive(stepnet_op, no_grad_names, uniq_id));
     }
 
     if (net->ops_.empty()) {  // Current no aux op is added to network
       return grad_op;
     }
-    net->AddOp(grad_op);
+    net->AppendOp(std::move(grad_op));
   }
   net->SetType("@GENERATED_BACKWARD@");
   net->CompleteAddOp();
-  return net;
-}  // namespace framework
+  return std::unique_ptr<OperatorBase>(
+      static_cast<OperatorBase*>(net.release()));
+}
 
 // See header for comments
-std::shared_ptr<OperatorBase> Backward(
+std::unique_ptr<OperatorBase> Backward(
     const OperatorBase& forwardOp,
     const std::unordered_set<std::string>& no_grad_vars) {
   std::unordered_set<std::string> no_grad_names;
diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h
index c181919dc1..1ecf69881b 100644
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@@ -20,7 +20,7 @@ namespace framework {
 
 // Create the backward operator from a forward operator.
 // TODO(yuyang18): Add more API reference comment.
-extern std::shared_ptr<OperatorBase> Backward(
+extern std::unique_ptr<OperatorBase> Backward(
     const OperatorBase& forwardOp,
     const std::unordered_set<std::string>& no_grad_vars);
 }  // namespace framework
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index d942604bf0..f100c4d054 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -32,9 +32,9 @@ class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
  public:
   RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input X of Add").AsNoGradient();
-    AddInput("b", "Bias of Add").AsNoGradient();
-    AddOutput("Out", "Out of Add").AsNoGradient();
+    AddInput("X", "Input X of Add").NotInGradient();
+    AddInput("b", "Bias of Add").NotInGradient();
+    AddOutput("Out", "Out of Add").NotInGradient();
     AddComment("Add Op");
   }
 };
@@ -72,16 +72,16 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
 
 class FcOp : public operators::NetOp {
  public:
-  FcOp(const std::string &type, const VarNameMap &inputs,
-       const VarNameMap &outputs, const AttributeMap &attrs)
+  FcOp(const std::string &type, const VariableNameMap &inputs,
+       const VariableNameMap &outputs, const AttributeMap &attrs)
       : NetOp(type, inputs, outputs, attrs) {
-    AddOp(OpRegistry::CreateOp("mul",
-                               {{"X", {Input("X")}}, {"Y", {Input("W")}}},
-                               {{"Out", {Output("mul_result")}}}, {}));
+    AppendOp(OpRegistry::CreateOp("mul",
+                                  {{"X", {Input("X")}}, {"Y", {Input("W")}}},
+                                  {{"Out", {Output("mul_result")}}}, {}));
     auto input_b = Inputs("b");
     std::string before_act = "mul_result";
     if (input_b.size() != 0) {
-      AddOp(OpRegistry::CreateOp(
+      AppendOp(OpRegistry::CreateOp(
           "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
           {{"Out", {Output("add_result")}}}, {}));
       before_act = "add_result";
@@ -92,8 +92,8 @@ class FcOp : public operators::NetOp {
       }
     }
 
-    AddOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
-                               {{"Out", {Output("Out")}}}, {}));
+    AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
+                                  {{"Out", {Output("Out")}}}, {}));
     CompleteAddOp(false);
   }
 };
@@ -180,8 +180,7 @@ TEST(Backward, simple_op_not_need_grad) {
   auto no_input_gop = f::Backward(*fwd, {"x", "b"});
   ASSERT_NE(no_input_gop, nullptr);
   ASSERT_TRUE(no_input_gop->IsNetOp());
-  ASSERT_EQ(0UL,
-            std::static_pointer_cast<ops::NetOp>(no_input_gop)->ops_.size());
+  ASSERT_EQ(0UL, static_cast<ops::NetOp *>(no_input_gop.get())->ops_.size());
 }
 
 TEST(Backward, net_fc_backward_normal) {
@@ -235,13 +234,13 @@ TEST(Backward, net_fc_backward_not_have_b) {
 
 TEST(Backward, net_input_of_network_not_need_grad) {
   ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
       {{"mul_result", {"mul_tmp_0"}},
        {"add_result", {"add_tmp_0"}},
        {"Out", {"hidden0"}}},
       {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
       {{"mul_result", {"mul_tmp_1"}},
        {"add_result", {"add_tmp_1"}},
@@ -274,10 +273,10 @@ TEST(Backward, net_input_of_network_not_need_grad) {
 
 TEST(Backward, net_shared_weight) {
   ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
-                                    {{"Out", {"out"}}}, {}));
-  net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
-                                    {{"Out", {"FinalOut"}}}, {}));
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
+                                       {{"Out", {"out"}}}, {}));
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
+                                       {{"Out", {"FinalOut"}}}, {}));
   net.CompleteAddOp();
 
   auto bwd = f::Backward(net, {});
@@ -358,19 +357,19 @@ TEST(Backward, op_part_of_input_are_not_need) {
 
 TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
   ops::NetOp net;
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
       {{"mul_result", {"mul_out1"}},
        {"add_result", {"add_out1"}},
        {"Out", {"out1"}}},
       {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
       {{"mul_result", {"mul_out2"}},
        {"add_result", {"tmp_out2"}},
        {"Out", {"out2"}}},
       {}));
-  net.AddOp(f::OpRegistry::CreateOp(
+  net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
       {{"mul_result", {"mul_out3"}},
        {"add_result", {"tmp_out3"}},
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 7077e8aa2c..ae44a1ffd4 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -60,7 +60,7 @@ message OpProto {
 
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
-    optional bool no_gradient = 5 [ default = false ];
+    optional bool not_in_gradient = 5 [ default = false ];
   }
 
   // AttrProto describes the C++ type Attribute.
diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc
index b73dac22d0..b02a599a80 100644
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -20,15 +20,15 @@ namespace framework {
 enum class OpArgType { IN, OUT };
 
 static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, OperatorBase::VarNameMap* vars) {
+                       bool is_grad, VariableNameMap* vars) {
   const auto& src_inout =
       src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
   auto& dst_inout = *vars;
-  const OpProto* proto = OpRegistry::op_info_map().at(src_op->Type()).proto_;
+  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
   const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto->inputs() : proto->outputs();
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
   for (const auto& arg : src_arg_list) {
-    if (arg.no_gradient() && !is_grad) continue;
+    if (arg.not_in_gradient() && !is_grad) continue;
     const std::string src_name = arg.name();
     std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
     dst_inout[dst_name].reserve(src_inout.at(src_name).size());
@@ -40,26 +40,18 @@ static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
 }
 
 OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto it = OpRegistry::op_info_map().find(op->Type());
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
-                 "'%s' has not been registered.", op->Type());
-  PADDLE_ENFORCE(it->second.proto_ != nullptr, "'%s' has no OpProto.",
-                 op->Type());
-  std::string grad_op_type = it->second.grad_op_type_;
-  PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.",
-                 op->Type());
+  auto& info = OpInfoMap::Instance().Get(op->Type());
+  PADDLE_ENFORCE(info.HasGradientOp());
 
-  OperatorBase::VarNameMap inputs;
-  OperatorBase::VarNameMap outputs;
+  VariableNameMap inputs;
+  VariableNameMap outputs;
   TransOpArg(op, OpArgType::IN, false, &inputs);   // I
   TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
   TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
   TransOpArg(op, OpArgType::IN, true, &outputs);   // IG
 
-  it = OpRegistry::op_info_map().find(grad_op_type);
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
-                 "'%s' has not been registered.", grad_op_type);
-  return it->second.creator_(grad_op_type, inputs, outputs, op->Attrs());
+  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
+  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
 }
 
 }  // namespace framework
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
index 0c26293fd2..902c2655e9 100644
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -26,10 +26,10 @@ class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
   IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable().AsNoGradient();
+    AddInput("In2_mult", "a multiple input").AsDuplicable().NotInGradient();
     AddInput("In3_mult", "another multiple input").AsDuplicable();
     AddOutput("Out1_mult", "a multiple output").AsDuplicable();
-    AddOutput("Out2", "a single output").AsNoGradient();
+    AddOutput("Out2", "a single output").NotInGradient();
     AddComment("op with inputs and outputs ignored in gradient calculating");
   }
 };
diff --git a/paddle/framework/op_info.cc b/paddle/framework/op_info.cc
new file mode 100644
index 0000000000..81ba29797c
--- /dev/null
+++ b/paddle/framework/op_info.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_info.h"
+
+namespace paddle {
+namespace framework {
+
+// Returns the process-wide OpInfoMap, creating it on the first call.
+OpInfoMap& OpInfoMap::Instance() {
+  // Function-local static: initialized exactly once, and that initialization
+  // is thread-safe since C++11. The map is heap-allocated and never deleted,
+  // so it stays valid for callers that run during static destruction.
+  static OpInfoMap* const op_info_map = new OpInfoMap();
+  return *op_info_map;
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
new file mode 100644
index 0000000000..94245c6c44
--- /dev/null
+++ b/paddle/framework/op_info.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/framework/attribute.h"
+
+namespace paddle {
+namespace framework {
+class OperatorBase;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+
+struct OpInfo {  // Everything the registry stores about one operator type.
+  OpCreator creator_;         // factory functor; empty until assigned
+  std::string grad_op_type_;  // empty when no gradient op is associated
+  OpProto* proto_{nullptr};   // raw pointer, not freed here; null = no proto
+  OpAttrChecker* checker_{nullptr};  // raw pointer; null = no checker
+  // True only when both the proto and the attribute checker are set.
+  bool HasOpProtoAndChecker() const {
+    return proto_ != nullptr && checker_ != nullptr;
+  }
+  // Returns the proto; enforce-fails if it is null or not initialized.
+  const OpProto& Proto() const {
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE(proto_->IsInitialized(),
+                   "Operator Proto must be initialized in op info");
+    return *proto_;
+  }
+  // Returns the attribute checker; enforce-fails if it is null.
+  const OpAttrChecker& Checker() const {
+    PADDLE_ENFORCE_NOT_NULL(checker_,
+                            "Operator Checker has not been registered");
+    return *checker_;
+  }
+  // Returns the creator functor; enforce-fails if it is empty.
+  const OpCreator& Creator() const {
+    PADDLE_ENFORCE_NOT_NULL(creator_,
+                            "Operator Creator has not been registered");
+    return creator_;
+  }
+  // Whether a gradient operator type is associated with this operator.
+  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+};
+
+class OpInfoMap {  // Registry mapping an operator type name to its OpInfo.
+ public:
+  static OpInfoMap& Instance();
+
+  OpInfoMap(const OpInfoMap& o) = delete;
+  OpInfoMap(OpInfoMap&& o) = delete;
+  OpInfoMap& operator=(const OpInfoMap& o) = delete;
+  OpInfoMap& operator=(OpInfoMap&& o) = delete;
+  // Whether an operator named `op_type` is present in the map.
+  bool Has(const std::string& op_type) const {
+    return map_.count(op_type) != 0;
+  }
+  // Adds `info` under `type`; enforce-fails when `type` already exists.
+  void Insert(const std::string& type, const OpInfo& info) {
+    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    map_.emplace(type, info);
+  }
+  // Looks up `type`; enforce-fails when it is absent.
+  const OpInfo& Get(const std::string& type) const {
+    auto found = map_.find(type);
+    PADDLE_ENFORCE(found != map_.end(), "Operator %s are not found", type);
+    return found->second;
+  }
+  // Calls callback(type, info) once per entry, in the map's iteration order.
+  template <typename Callback>
+  void IterAllInfo(Callback callback) {
+    for (auto& entry : map_) {
+      callback(entry.first, entry.second);
+    }
+  }
+
+ private:
+  OpInfoMap() = default;
+  std::unordered_map<std::string, const OpInfo> map_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
index 1caa02a2a1..b0e85dd49f 100644
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -17,5 +17,45 @@ limitations under the License. */
 #include <vector>
 
 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
+    const std::string& type, const VariableNameMap& inputs,
+    const VariableNameMap& outputs, AttributeMap attrs) {
+  auto& info = OpInfoMap::Instance().Get(type);  // enforce-fails if unknown
+  info.Checker().Check(attrs);  // enforce-fails if no checker is registered
+  auto op = info.Creator()(type, inputs, outputs, attrs);
+  return std::unique_ptr<OperatorBase>(op);  // take ownership of the raw op
+}
+
+// Groups the argument names of each OpDesc::Var under its parameter name.
+static VariableNameMap ConvertOpDescVarsToVarNameMap(
+    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
+  VariableNameMap name_map;
+  for (const auto& desc_var : op_desc_vars) {
+    auto& names = name_map[desc_var.parameter()];
+    const auto& args = desc_var.arguments();
+    names.reserve(static_cast<size_t>(args.size()));
+    names.insert(names.end(), args.begin(), args.end());
+  }
+  return name_map;
+}
+
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+  AttributeMap attrs;
+  for (const auto& attr_desc : op_desc.attrs()) {
+    attrs[attr_desc.name()] = GetAttrValue(attr_desc);
+  }
+  const auto inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
+  const auto outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
+  // Delegate to the overload taking the already-decoded arguments.
+  return CreateOp(op_desc.type(), inputs, outputs, attrs);
+}
+
+std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
+  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
+  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));
+}
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 120f4ede6b..2d09cde41e 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -23,132 +23,24 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
 
 namespace paddle {
 namespace framework {
 
-// this class not only make proto but also init attribute checkers.
-class OpProtoAndCheckerMaker {
- public:
-  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : proto_(proto), op_checker_(op_checker) {}
-
-  ~OpProtoAndCheckerMaker() {
-    PADDLE_ENFORCE(validated_, "should call Validate after build");
-  }
-
-  void Validate() {
-    validated_ = true;
-    CheckNoDuplicatedInOutAttrs();
-  }
-
- protected:
-  struct VariableBuilder {
-    OpProto::Var* var_;
-
-    VariableBuilder& AsDuplicable() {
-      var_->set_duplicable(true);
-      return *this;
-    }
-
-    VariableBuilder& AsIntermediate() {
-      var_->set_intermediate(true);
-      return *this;
-    }
-
-    // TODO(FengJiayi, yuyang18): `AsNoGradient` is a very bad name, because it
-    // means that input/output is not needed when calculate gradient. It does
-    // not mean no gradient when backward. It should be changed soon.
-    VariableBuilder& AsNoGradient() {
-      var_->set_no_gradient(true);
-      return *this;
-    }
-  };
-
-  VariableBuilder AddInput(const std::string& name,
-                           const std::string& comment) {
-    auto* input = proto_->add_inputs();
-    input->set_name(name);
-    input->set_comment(comment);
-    return VariableBuilder{input};
-  }
-
-  VariableBuilder AddOutput(const std::string& name,
-                            const std::string& comment) {
-    auto* output = proto_->add_outputs();
-    output->set_name(name);
-    output->set_comment(comment);
-    return VariableBuilder{output};
-  }
-
-  template <typename T>
-  TypedAttrChecker<T>& AddAttr(const std::string& name,
-                               const std::string& comment,
-                               bool generated = false) {
-    auto* attr = proto_->add_attrs();
-    attr->set_name(name);
-    attr->set_comment(comment);
-    attr->set_generated(generated);
-    attr->set_type(AttrTypeID<T>());
-    return op_checker_->AddAttrChecker<T>(name);
-  }
-
-  void AddComment(const std::string& comment) { proto_->set_comment(comment); }
-
- private:
-  void CheckNoDuplicatedInOutAttrs() {
-    std::unordered_set<std::string> names;
-    auto checker = [&](const std::string& name) {
-      PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
-      names.insert(name);
-    };
-    for (auto& attr : proto_->attrs()) {
-      checker(attr.name());
-    }
-    for (auto& input : proto_->inputs()) {
-      checker(input.name());
-    }
-    for (auto& output : proto_->outputs()) {
-      checker(output.name());
-    }
-  }
-
-  OpProto* proto_;
-  OpAttrChecker* op_checker_;
-  bool validated_{false};
-};
-
-class NOPMaker : public OpProtoAndCheckerMaker {
- public:
-  NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {}
-};
-
 class OpRegistry {
-  using VarNameMap = OperatorBase::VarNameMap;
-  using OpCreator = std::function<OperatorBase*(
-      const std::string& /*type*/, const VarNameMap& /*inputs*/,
-      const VarNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
-
  public:
-  struct OpInfo {
-    OpCreator creator_;
-    std::string grad_op_type_;
-    OpProto* proto_;
-    OpAttrChecker* checker_;
-  };
-
   template <typename OpType, typename ProtoMakerType, typename GradOpType>
   static void RegisterOp(const std::string& op_type,
                          const std::string& grad_op_type) {
-    PADDLE_ENFORCE(op_info_map().count(op_type) == 0,
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
                    "'%s' is registered more than once.", op_type);
     OpInfo op_info;
-    op_info.creator_ = [](const std::string& type, const VarNameMap& inputs,
-                          const VarNameMap& outputs,
-                          const AttributeMap& attrs) {
+    op_info.creator_ = [](
+        const std::string& type, const VariableNameMap& inputs,
+        const VariableNameMap& outputs, const AttributeMap& attrs) {
       return new OpType(type, inputs, outputs, attrs);
     };
     op_info.grad_op_type_ = grad_op_type;
@@ -167,60 +59,21 @@ class OpRegistry {
       op_info.proto_ = nullptr;
       op_info.checker_ = nullptr;
     }
-    op_info_map().insert(std::make_pair(op_type, op_info));
+    OpInfoMap::Instance().Insert(op_type, op_info);
     // register gradient op
     if (!grad_op_type.empty()) {
       RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
     }
   }
 
-  static std::shared_ptr<OperatorBase> CreateOp(const std::string& type,
-                                                const VarNameMap& inputs,
-                                                const VarNameMap& outputs,
-                                                AttributeMap attrs) {
-    auto it = op_info_map().find(type);
-    PADDLE_ENFORCE(it != op_info_map().end(),
-                   "Operator '%s' has not been registered.", type);
-    it->second.checker_->Check(attrs);
-    auto op = it->second.creator_(type, inputs, outputs, attrs);
-    return std::shared_ptr<OperatorBase>(op);
-  }
-
-  static VarNameMap ConvertOpDescVarsToVarNameMap(
-      const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
-    VarNameMap ret_val;
-    for (auto& var : op_desc_vars) {
-      auto& var_names = ret_val[var.parameter()];
-      auto& var_names_in_proto = var.arguments();
-      var_names.reserve(static_cast<size_t>(var_names_in_proto.size()));
-      std::copy(var_names_in_proto.begin(), var_names_in_proto.end(),
-                std::back_inserter(var_names));
-    }
-    return ret_val;
-  }
-
-  static std::shared_ptr<OperatorBase> CreateOp(const OpDesc& op_desc) {
-    VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
-    VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
-    AttributeMap attrs;
-    for (auto& attr : op_desc.attrs()) {
-      attrs[attr.name()] = GetAttrValue(attr);
-    }
+  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
+                                                const VariableNameMap& inputs,
+                                                const VariableNameMap& outputs,
+                                                AttributeMap attrs);
 
-    return CreateOp(op_desc.type(), inputs, outputs, attrs);
-  }
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
 
-  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
-    PADDLE_ENFORCE(!op.IsNetOp(),
-                   "Use framework::Backward to get backward ops");
-    std::shared_ptr<OperatorBase> grad_op(BuildGradOp(&op));
-    return grad_op;
-  }
-
-  static std::unordered_map<std::string, const OpInfo>& op_info_map() {
-    static std::unordered_map<std::string, const OpInfo> op_info_map_;
-    return op_info_map_;
-  }
+  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
 };
 
 class Registrar {
@@ -272,8 +125,18 @@ class OpKernelRegistrar : public Registrar {
                     grad_op_class)                                            \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                             \
       __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
-  static ::paddle::framework::OpRegistrar<op_class, op_maker_class,           \
-                                          grad_op_class>                      \
+  class _OpClass_##op_type##_ : public op_class {                             \
+   public:                                                                    \
+    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                            \
+    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);                   \
+  };                                                                          \
+  class _OpGradClass_##op_type##_ : public grad_op_class {                    \
+   public:                                                                    \
+    DEFINE_OP_CLONE_METHOD(_OpGradClass_##op_type##_);                        \
+    DEFINE_OP_CONSTRUCTOR(_OpGradClass_##op_type##_, grad_op_class);          \
+  };                                                                          \
+  static ::paddle::framework::OpRegistrar<                                    \
+      _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_>       \
       __op_registrar_##op_type##__(#op_type, #grad_op_type);                  \
   int TouchOpRegistrar_##op_type() {                                          \
     __op_registrar_##op_type##__.Touch();                                     \
@@ -304,7 +167,8 @@ class OpKernelRegistrar : public Registrar {
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
 
 /**
- * Macro to mark what Operator and Kernel we will use and tell the compiler to
+ * Macro to mark what Operator and Kernel
+ * we will use and tell the compiler to
  * link them into target.
  */
 #define USE_OP_ITSELF(op_type)                                    \
@@ -324,7 +188,8 @@ class OpKernelRegistrar : public Registrar {
       __attribute__((unused)) =                                  \
           TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE()
 
-// TODO(fengjiayi): The following macros seems ugly, do we have better method?
+// TODO(fengjiayi): The following macros
+// seem ugly; is there a better approach?
 
 #ifdef PADDLE_ONLY_CPU
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 1a85d56835..50c45919c5 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -76,8 +76,7 @@ TEST(OpRegistry, CreateOp) {
   attr->set_type(paddle::framework::AttrType::FLOAT);
   attr->set_f(scale);
 
-  std::shared_ptr<paddle::framework::OperatorBase> op =
-      paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
@@ -118,8 +117,7 @@ TEST(OpRegistry, DefaultValue) {
 
   ASSERT_TRUE(op_desc.IsInitialized());
 
-  std::shared_ptr<paddle::framework::OperatorBase> op =
-      paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 0daf12e7f5..7abbde610f 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -115,8 +115,8 @@ void OperatorBase::Rename(const std::string& old_name,
 }
 
 OperatorBase::OperatorBase(const std::string& type,
-                           const OperatorBase::VarNameMap& inputs,
-                           const OperatorBase::VarNameMap& outputs,
+                           const VariableNameMap& inputs,
+                           const VariableNameMap& outputs,
                            const AttributeMap& attrs)
     : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
   static std::atomic<size_t> gUniqId(0UL);
@@ -141,18 +141,10 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
     }
     return ret_val;
   }
-  auto it = OpRegistry::op_info_map().find(type_);
-  PADDLE_ENFORCE(
-      it != OpRegistry::op_info_map().end(),
-      "Operator %s not registered, cannot figure out intermediate outputs",
-      type_);
-  PADDLE_ENFORCE(
-      it->second.proto_ != nullptr,
-      "Operator %s has no OpProto, cannot figure out intermediate outputs",
-      type_);
+  auto& info = OpInfoMap::Instance().Get(Type());
 
   // get all OpProto::Var for outputs
-  for (auto& o : it->second.proto_->outputs()) {
+  for (auto& o : info.Proto().outputs()) {
     // ignore all intermediate output
     if (o.intermediate()) continue;
     auto out = outputs_.find(o.name());
@@ -164,5 +156,43 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
   return ret_val;
 }
 
+void OpProtoAndCheckerMaker::Validate() {
+  validated_ = true;
+  CheckNoDuplicatedInOutAttrs();
+}
+
+OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
+    const std::string& name, const std::string& comment) {
+  auto* input = proto_->add_inputs();
+  input->set_name(name);
+  input->set_comment(comment);
+  return OpProtoAndCheckerMaker::VariableBuilder{input};
+}
+
+OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
+    const std::string& name, const std::string& comment) {
+  auto* output = proto_->add_outputs();
+  output->set_name(name);
+  output->set_comment(comment);
+  return OpProtoAndCheckerMaker::VariableBuilder{output};
+}
+
+// Verifies that every attribute, input and output declared on the proto has
+// a distinct name; the three kinds are checked against one shared set.
+// Enforce-fails with "[<name>] is duplicated" on the first repeated name.
+void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
+  std::unordered_set<std::string> seen;
+  // Records `name`, failing if it was already recorded.
+  auto require_unique = [&seen](const std::string& name) {
+    const bool is_new = seen.insert(name).second;
+    PADDLE_ENFORCE(is_new, "[%s] is duplicated", name);
+  };
+
+  // Check order: attributes, then inputs, then outputs.
+  for (const auto& attr : proto_->attrs()) require_unique(attr.name());
+  for (const auto& input : proto_->inputs()) require_unique(input.name());
+  for (const auto& output : proto_->outputs()) require_unique(output.name());
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 60d4f06c7e..8397570d26 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
@@ -62,14 +63,8 @@ class ExecutionContext;
  */
 class OperatorBase {
  public:
-  using VarNameMap = std::map<std::string, std::vector<std::string>>;
-
-  OperatorBase(const std::string& type, const VarNameMap& inputs,
-               const VarNameMap& outputs, const AttributeMap& attrs);
-
-  OperatorBase(const OperatorBase& o) = delete;
-  OperatorBase& operator=(const OperatorBase& o) = delete;
-  OperatorBase(OperatorBase&& o) = delete;
+  OperatorBase(const std::string& type, const VariableNameMap& inputs,
+               const VariableNameMap& outputs, const AttributeMap& attrs);
 
   virtual ~OperatorBase() {}
 
@@ -97,8 +92,8 @@ class OperatorBase {
   /// rename inputs outputs name
   void Rename(const std::string& old_name, const std::string& new_name);
 
-  const VarNameMap& Inputs() const { return inputs_; }
-  const VarNameMap& Outputs() const { return outputs_; }
+  const VariableNameMap& Inputs() const { return inputs_; }
+  const VariableNameMap& Outputs() const { return outputs_; }
   //! Get a input with argument's name described in `op_proto`
   const std::string& Input(const std::string& name) const;
   //! Get a input which has multiple variables.
@@ -116,26 +111,117 @@ class OperatorBase {
   void SetType(const std::string& type) { type_ = type; }
   const AttributeMap& Attrs() const { return attrs_; }
 
+  // Returns a new operator instance that is a copy of this one.
+  // A unique_ptr is returned so the caller cannot forget to delete it.
+  virtual std::unique_ptr<OperatorBase> Clone() const = 0;
+
  protected:
   std::string type_;
   // NOTE: in case of OpGrad, inputs_ contains:
-  // I (Inputs)
+  // I (Inputs)
   // O (Outputs)
   // OG (Output Gradients)
-  VarNameMap inputs_;
+  VariableNameMap inputs_;
 
   // NOTE: in case of OpGrad, outputs_ contains
   // IG (Inputs Gradients)
-  VarNameMap outputs_;
+  VariableNameMap outputs_;
   AttributeMap attrs_;
 };
 
+// Defines the `Clone` override for operator class `cls` via its copy
+// constructor. Operators registered with REGISTER_OP get `Clone` from the
+// registration macro and therefore do not need to use this directly.
+#define DEFINE_OP_CLONE_METHOD(cls)                       \
+  std::unique_ptr<OperatorBase> Clone() const final {     \
+    return std::unique_ptr<OperatorBase>(new cls(*this)); \
+  }
+
+// Defines the standard four-argument operator constructor for `cls`, which
+// forwards every argument to `parent_cls`. Alternatively, write
+//   using PARENT_CLASS::PARENT_CLASS;
+// to inherit the parent's constructors.
+#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)             \
+  cls(const std::string& type,                             \
+      const ::paddle::framework::VariableNameMap& inputs,  \
+      const ::paddle::framework::VariableNameMap& outputs, \
+      const ::paddle::framework::AttributeMap& attrs)      \
+      : parent_cls(type, inputs, outputs, attrs) {}
+
 class NOP : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
   void InferShape(const Scope& scope) const override {}
   void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {}
+  std::unique_ptr<OperatorBase> Clone() const override {
+    return std::unique_ptr<OperatorBase>(new NOP(*this));
+  }
+};
+
+// this class not only make proto but also init attribute checkers.
+class OpProtoAndCheckerMaker {
+ public:
+  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : proto_(proto), op_checker_(op_checker) {}
+
+  ~OpProtoAndCheckerMaker() {
+    PADDLE_ENFORCE(validated_, "should call Validate after build");
+  }
+
+  void Validate();
+
+ protected:
+  struct VariableBuilder {
+    OpProto::Var* var_;
+
+    VariableBuilder& AsDuplicable() {
+      var_->set_duplicable(true);
+      return *this;
+    }
+
+    VariableBuilder& AsIntermediate() {
+      var_->set_intermediate(true);
+      return *this;
+    }
+
+    VariableBuilder& NotInGradient() {
+      var_->set_not_in_gradient(true);
+      return *this;
+    }
+  };
+
+  VariableBuilder AddInput(const std::string& name, const std::string& comment);
+
+  VariableBuilder AddOutput(const std::string& name,
+                            const std::string& comment);
+
+  template <typename T>
+  TypedAttrChecker<T>& AddAttr(const std::string& name,
+                               const std::string& comment,
+                               bool generated = false) {
+    auto* attr = proto_->add_attrs();
+    attr->set_name(name);
+    attr->set_comment(comment);
+    attr->set_generated(generated);
+    attr->set_type(AttrTypeID<T>());
+    return op_checker_->AddAttrChecker<T>(name);
+  }
+
+  void AddComment(const std::string& comment) { proto_->set_comment(comment); }
+
+ private:
+  void CheckNoDuplicatedInOutAttrs();
+
+  OpProto* proto_;
+  OpAttrChecker* op_checker_;
+  bool validated_{false};
+};
+
+class NOPMaker : public OpProtoAndCheckerMaker {
+ public:
+  NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {}
 };
 
 class InferShapeContext {
@@ -304,8 +390,8 @@ class OperatorWithKernel : public OperatorBase {
   using OpKernelMap =
       std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
 
-  OperatorWithKernel(const std::string& type, const VarNameMap& inputs,
-                     const VarNameMap& outputs, const AttributeMap& attrs)
+  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
+                     const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
   void InferShape(const Scope& scope) const override {
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 0441cec9f6..1d7efb7b94 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -23,8 +23,8 @@ static int op_run_num = 0;
 
 class OpWithoutKernelTest : public OperatorBase {
  public:
-  OpWithoutKernelTest(const std::string& type, const VarNameMap& inputs,
-                      const VarNameMap& outputs, const AttributeMap& attrs)
+  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
+                      const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs), x(1) {}
   void InferShape(const Scope& scope) const override {}
   void Run(const Scope& scope,
@@ -245,3 +245,22 @@ TEST(OpKernel, multi_inputs) {
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   op->Run(scope, cpu_device_context);
 }
+
+class OperatorClone : public paddle::framework::OperatorBase {
+ public:
+  DEFINE_OP_CLONE_METHOD(OperatorClone);
+  OperatorClone(const std::string& type,
+                const paddle::framework::VariableNameMap& inputs,
+                const paddle::framework::VariableNameMap& outputs,
+                const paddle::framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void InferShape(const paddle::framework::Scope& scope) const override {}
+  void Run(const paddle::framework::Scope& scope,
+           const paddle::platform::DeviceContext& dev_ctx) const override {}
+};
+
+TEST(Operator, Clone) {
+  OperatorClone a("ABC", {}, {}, {});
+  auto b = a.Clone();  // the clone must report the same operator type
+  ASSERT_EQ(a.Type(), b->Type());
+}
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index b8c779f4e5..643f875491 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -105,7 +105,10 @@ class Tensor {
   template <typename T>
   inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
-  platform::Place place() const { return holder_->place(); }
+  platform::Place place() const {
+    PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder");
+    return holder_->place();
+  }
 
  private:
   template <typename T>
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 7dfb6f61c5..c572a9d433 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp)
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 list(APPEND cpp_files BufferArg.cpp)
+list(APPEND cpp_files GemmFunctor.cpp)  # not matched by the *Op.cpp glob
+if(USE_EIGEN_FOR_BLAS)  # option declared in the top-level CMakeLists.txt
+  list(APPEND cpp_files EigenGemm.cpp)
+endif(USE_EIGEN_FOR_BLAS)
 
 if(WITH_GPU)
     file(GLOB cu_files . *OpGpu.cu)
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 490e8d546c..2f3112fe65 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "DepthwiseConvOp.h"
 #include "ConvOp.h"
-#include "GemmFunctor.h"
 
 namespace paddle {
 
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
index 33463805cb..2d722dfcfc 100644
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "DepthwiseConvOp.h"
-#include "GemmFunctor.h"
 #include "paddle/math/BaseMatrix.h"
 
 namespace paddle {
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
new file mode 100644
index 0000000000..674141ed39
--- /dev/null
+++ b/paddle/function/EigenGemm.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+
+template <class T>
+struct EigenBlasGemm {
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
+                           Eigen::Aligned>
+      Matrix;
+  // C = alpha * op(A) * op(B) + beta * C, with op(X) = X or X^T per transX.
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    Eigen::array<int, 2> sizeA;  // shape of A as stored, i.e. before op()
+    if (transA) {
+      sizeA[0] = K;
+      sizeA[1] = M;
+      CHECK_EQ(M, lda);  // rows must be densely packed (no padding)
+    } else {
+      sizeA[0] = M;
+      sizeA[1] = K;
+      CHECK_EQ(K, lda);
+    }
+    Eigen::array<int, 2> sizeB;  // shape of B as stored, i.e. before op()
+    if (transB) {
+      sizeB[0] = N;
+      sizeB[1] = K;
+      CHECK_EQ(K, ldb);
+    } else {
+      sizeB[0] = K;
+      sizeB[1] = N;
+      CHECK_EQ(N, ldb);
+    }
+    Eigen::array<int, 2> sizeC;  // the result is always M x N
+    sizeC[0] = M;
+    sizeC[1] = N;
+    CHECK_EQ(N, ldc);
+    const Matrix a(const_cast<T*>(A), sizeA);
+    const Matrix b(const_cast<T*>(B), sizeB);
+    Matrix c(C, sizeC);
+    // Contract the K axis of each operand; its index depends on transposition.
+    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(transA ? 0 : 1, transB ? 1 : 0);
+    // As in BLAS, beta == 0 means C is write-only: never evaluate 0 * C,
+    // which would leak NaN/Inf from an uninitialized output buffer.
+    Eigen::DefaultDevice device;
+    if (alpha == T(1) && beta == T(0)) {
+      c.device(device) = a.contract(b, dims);
+    } else if (alpha == T(1) && beta == T(1)) {
+      c.device(device) += a.contract(b, dims);
+    } else if (beta == T(0)) {
+      c.device(device) = alpha * a.contract(b, dims);
+    } else {
+      c.device(device) = alpha * a.contract(b, dims) + beta * c;
+    }
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template class EigenBlasGemm<double>;
+#else
+template class EigenBlasGemm<float>;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 0ada4d70a0..f8cf4ebea8 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -85,7 +85,6 @@ public:
     }
 
     Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -108,19 +107,19 @@ public:
         int M = outputChannels / groups_;
         int N = outputHeight * outputWidth;
         int K = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
-             CblasNoTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             filterData + g * filterOffset,
-             K,
-             colData,
-             N,
-             beta,
-             outputData + g * outputOffset,
-             N);
+        BlasGemm<Device, real>::compute(false,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        K,
+                                        colData,
+                                        N,
+                                        beta,
+                                        outputData + g * outputOffset,
+                                        N);
       }
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
@@ -188,8 +187,6 @@ public:
     }
 
     Col2ImFunctor<kCFO, Device, real> col2im;
-    GemmFunctor<Device, real> gemm;
-
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -205,19 +202,19 @@ public:
           colData = inputGrad + g * inputOffset;
           scale = 1.0f;
         }
-        gemm(CblasTrans,
-             CblasNoTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             filterData + g * filterOffset,
-             M,
-             outputGrad + g * outputOffset,
-             N,
-             scale,
-             colData,
-             N);
+        BlasGemm<Device, real>::compute(true,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        M,
+                                        outputGrad + g * outputOffset,
+                                        N,
+                                        scale,
+                                        colData,
+                                        N);
         if (needIm2col) {
           col2im(inputGrad + g * inputOffset,
                  imShape,
@@ -299,7 +296,6 @@ public:
     }
 
     Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -321,19 +317,19 @@ public:
         int M = outputChannels / groups_;
         int K = outputHeight * outputWidth;
         int N = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
-             CblasTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             outputGrad + g * outputOffset,
-             K,
-             colData,
-             K,
-             i == 0 ? beta : 1.0f,
-             filterGrad + g * filterOffset,
-             N);
+        BlasGemm<Device, real>::compute(false,
+                                        true,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        outputGrad + g * outputOffset,
+                                        K,
+                                        colData,
+                                        K,
+                                        i == 0 ? beta : 1.0f,
+                                        filterGrad + g * filterOffset,
+                                        N);
       }
       inputData += inputChannels * inputHeight * inputWidth;
       outputGrad += outputChannels * outputHeight * outputWidth;
diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp
new file mode 100644
index 0000000000..9e25ee58a1
--- /dev/null
+++ b/paddle/function/GemmFunctor.cpp
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GemmFunctor.h"
+#include "paddle/math/MathFunctions.h"
+
+namespace paddle {
+
+/**
+ * CPU specialization of BlasGemm.
+ *
+ * Forwards the GEMM arguments to EigenBlasGemm when
+ * PADDLE_USE_EIGEN_FOR_BLAS is defined, and to gemm<T> otherwise.
+ * transA / transB tell the backend whether A / B are passed transposed;
+ * all remaining arguments are handed over unchanged.
+ */
+template <class T>
+struct BlasGemm<DEVICE_TYPE_CPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+#ifdef PADDLE_USE_EIGEN_FOR_BLAS
+    // The Eigen backend takes the boolean transpose flags unchanged.
+    EigenBlasGemm<T>::compute(
+        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#else
+    // gemm<T> takes CBLAS transpose enums, so map the flags first.
+    const CBLAS_TRANSPOSE opA = transA ? CblasTrans : CblasNoTrans;
+    const CBLAS_TRANSPOSE opB = transB ? CblasTrans : CblasNoTrans;
+    gemm<T>(opA, opB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#endif
+  }
+};
+
+/**
+ * GPU specialization of BlasGemm, implemented with hl_matrix_mul.
+ *
+ * The boolean transpose flags are mapped to HPPL_OP_T / HPPL_OP_N.
+ */
+template <class T>
+struct BlasGemm<DEVICE_TYPE_GPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    // const is cast away so A and B are passed to hl_matrix_mul as T*.
+    hl_matrix_mul(const_cast<T*>(A),
+                  transA ? HPPL_OP_T : HPPL_OP_N,
+                  const_cast<T*>(B),
+                  transB ? HPPL_OP_T : HPPL_OP_N,
+                  C, M, N, K,
+                  alpha, beta,
+                  lda, ldb, ldc);
+  }
+};
+
+template struct BlasGemm<DEVICE_TYPE_CPU, real>;
+template struct BlasGemm<DEVICE_TYPE_GPU, real>;
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h
index d5db5cf5e7..0809953b4e 100644
--- a/paddle/function/GemmFunctor.h
+++ b/paddle/function/GemmFunctor.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/MathFunctions.h"
+#include "TensorType.h"
 
 namespace paddle {
 
@@ -24,73 +24,42 @@ namespace paddle {
 // of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
 // interface.
 template <DeviceType Device, class T>
-class GemmFunctor {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc);
+struct BlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
 };
 
+// TODO(hedaoyuan): The definition of the `real` type in Paddle conflicts
+// with the Eigen library, so code that uses Eigen cannot include the Paddle
+// header files. An EigenBlasGemm template class that does not take the
+// DeviceType parameter is therefore needed for now.
+// I will fix this problem and merge BlasGemm and EigenBlasGemm into one.
 template <class T>
-class GemmFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc) {
-    gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
-  }
-};
-
-template <class T>
-class GemmFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc) {
-    hl_matrix_mul((T*)A,
-                  transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  (T*)B,
-                  TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  C,
-                  M,
-                  N,
-                  K,
-                  alpha,
-                  beta,
-                  lda,
-                  ldb,
-                  ldc);
-  }
+struct EigenBlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index cfa80a8936..26cff3e677 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -202,7 +202,7 @@ void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
         auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
             para->getMat(PARAMETER_VALUE).get());
         para->clearGradient();
-        mat->clearIndices();
+        if (mat) mat->clearIndices();
       }
     }
   }
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index f98bf95064..9f29b97466 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -184,7 +184,7 @@ public:
   }
 
   void backward(const UpdateCallback& callback) override {
-    if (biases_) {
+    if (biases_ && biases_->getWGrad()) {
       backwardActivation();
       biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
       biases_->getParameterPtr()->incUpdate(callback);
@@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() {
                            /* width */ resultNum,
                            false,
                            /* useGpu */ false);
-    Matrix::resizeOrCreate(generator_.outArg.value,
-                           /* height */ maxGenWordCount,
-                           /* width */ 1,
-                           false,
-                           /* useGpu */ false);
   }
   ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                 numSequences + 1,
@@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() {
   } else {
     oneWaySearch(numSequences);
   }
-  if (dataArgsSize_) createDataOutlink(batchMachineIdVec_);
+  if (dataArgsSize_) createDataOutlink();
 
   size_t size = generator_.ids.size();
   generator_.outArg.ids->resize(size);
@@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
   }
 
   batchMachineIdVec_.clear();
+  batchMachineStartPos_.clear();
   int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
   starts[0] = 0;
   generator_.ids.clear();
@@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
     finalPaths_[i].resize(minFinalPathsSize);
   }
 
-  batchMachineIdVec_.clear();
   generator_.ids.clear();
   int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
   starts[0] = 0;
   if (numResults > 1) {
-    real* probs = generator_.outArg.in->getData();
+    int idsProbSaveSize = 0;  // total ids plus one end mark per path
+    for (const auto& inSeq : finalPaths_) {
+      for (const auto& path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
     real* idsProb = generator_.outArg.value->getData();
+
+    real* probs = generator_.outArg.in->getData();
     size_t curPos = 0;
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
@@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() {
         curPos += genLen;
         idsProb[curPos++] = -1.0;
         probs[i * numResults + j] = path.logProb;
-
-        if (!j && dataArgsSize_) {
-          // in beam search, here only reserved the top 1 generated result
-          // for out_links that are not the generated word indices.
-          batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                                    path.machineIdVec.begin(),
-                                    path.machineIdVec.end());
-        }
       }
       starts[i + 1] = generator_.ids.size();
     }
   } else {
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       CHECK(!finalPaths_[i].empty());
-      generator_.ids.insert(generator_.ids.begin(),
-                            finalPaths_[i][0].ids.begin(),
-                            finalPaths_[i][0].ids.end());
-      starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size();
+      Path& path = finalPaths_[i][0];
+      generator_.ids.insert(
+          generator_.ids.end(), path.ids.begin(), path.ids.end());
+      starts[i + 1] = starts[i] + path.ids.size();
     }
   }
 }
@@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
   }
 }
 
-void RecurrentGradientMachine::createDataOutlink(
-    std::vector<int>& machineIdVec) {
-  size_t seqNum =
-      getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size();
-  std::vector<int> starts(seqNum + 1, 0);
-  for (size_t i = 0; i < seqNum; ++i) {
-    size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size()
-                                        : finalPaths_[0][i].ids.size();
-    starts[i + 1] = starts[i] + seqLen;
+// Rebuilds batchMachineIdVec_ from the machineIdVec of every final path:
+// sequence outlinks map each id through the sequence start positions of the
+// outArgs entry with the same index, other outlinks copy the ids unchanged.
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
+    bool isSeq, std::vector<Argument>& outArgs) {
+  batchMachineIdVec_.clear();
+  for (auto& paths : finalPaths_) {
+    for (auto& path : paths) {
+      std::vector<int>& machineIdVec = path.machineIdVec;
+      if (!isSeq) {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+        continue;
+      }
+      for (size_t step = 0; step < machineIdVec.size(); ++step) {
+        size_t rowId = machineIdVec[step];
+        int* seqPos =
+            outArgs[step].sequenceStartPositions->getMutableData(false);
+        batchMachineIdVec_.push_back(seqPos[rowId]);
+      }
+    }
+  }
+}
+
+// Fills copySize (rows to copy per path) and batchMachineStartPos_ (running
+// sum of the paths' ids.size()) for the data outlink described by outArgs.
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = 0;
+  for (const auto& paths : finalPaths_) totalSeqNum += paths.size();
+  copySize.resize(totalSeqNum, 1);
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
+  if (!isSeq) {
+    // NOTE(review): only finalPaths_[0] is walked here - confirm intended.
+    const std::vector<Path>& paths = finalPaths_[0];
+    for (size_t i = 0; i < paths.size(); ++i)
+      batchMachineStartPos_[i + 1] =
+          batchMachineStartPos_[i] + paths[i].ids.size();
+    return;
+  }
+  const bool beam = getBeamSize() > 1;
+  ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+  CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+           beam ? finalPaths_.size() : finalPaths_[0].size());
+  int* starts = inputSeqStartPos->getMutableData(false);
+  int seqId = 0;
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j, ++seqId) {
+      const size_t src = beam ? i : j;
+      copySize[seqId] = starts[src + 1] - starts[src];
+      batchMachineStartPos_[seqId + 1] =
+          batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
+    }
   }
+}
 
+void RecurrentGradientMachine::createDataOutlink() {
   for (size_t i = 0; i < dataArgsSize_; i++) {
+    bool isSeq = dataArgsFrame_[i][0].hasSeq();
+    std::vector<int> copySize;
+    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
+    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
+
     dataArgs_[i].concat(dataArgsFrame_[i],
-                        machineIdVec,
-                        starts,
+                        batchMachineIdVec_,
+                        batchMachineStartPos_,
+                        copySize,
                         useGpu_,
                         HPPL_STREAM_1,
                         PASS_TEST);
-
     auto dataAgent =
         dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
     CHECK_NOTNULL(dataAgent);
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index fb3fc5877a..c16fae6d17 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -190,7 +190,7 @@ public:
     std::vector<int> ids;
 
     /**
-     * @brief idsProb, log probability of each generated words.
+     * @brief idsProb, log probability of each generated word.
      */
     std::vector<real> idsProb;
 
@@ -472,15 +472,43 @@ private:
   void copyDataOutlinkFrame(size_t machineCur);
 
   /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlinks
-   * except the first one are data outlinks. This function creates the data
-   * outlinks.
-   * @note In beam search, only one generated sequence with the hightest log
-   * probabilites are retained.
-   * @param machineIdVec : select a row of output matrix in each frame
-   * that the generation process expanded.
+   * @brief In generation, if the layer group has more than one outlink, every
+   * outlink except the first one is a data outlink. In RecurrentLayerGroup,
+   * each time step is a separate Network, so the outputs of a layer inside
+   * the RecurrentLayerGroup are stored in separate Arguments. If a layer is
+   * specified as an outlink of RecurrentLayerGroup, this function collects
+   * its outputs at each time step of each generated sequence, which are
+   * dispersed in separate Arguments, and forms a single new Argument as the
+   * output of RecurrentLayerGroup.
    */
-  void createDataOutlink(std::vector<int>& machineIdVec);
+  void createDataOutlink();
+
+  /*
+   * @brief decide how many rows to select, from a start position, from the
+   * Matrix that stores the forward pass results.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output by the
+   * RecurrentGradientMachine is a sequence or not.
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   * @param copySize: the returned result, the number of rows to select from
+   * the Matrix that stores the forward pass results, from a start position.
+   */
+  void createDataOutlinkCopySizeInfo(bool isSeq,
+                                     std::vector<Argument>& outArgs,
+                                     std::vector<int>& copySize);
+
+  /*
+   * @brief decide the index of the start row, for each time step of a
+   * generated sequence, in the Matrix that stores the forward pass results
+   * of the entire beam search batch.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output by the
+   * RecurrentGradientMachine is a sequence or not.
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   */
+  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
 
   /*
    * @brief used in beam search, connect previous frame to form recurrent link
@@ -543,6 +571,7 @@ private:
   std::vector<int> topIds_;
   std::vector<int> seqIds_;
   std::vector<int> batchMachineIdVec_;
+  std::vector<int> batchMachineStartPos_;
   std::vector<std::vector<Path>> finalPaths_;
   std::vector<real> minFinalPathLogProb_;
   BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index e161d89c38..a5328ef834 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -32,9 +32,11 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     const ConvConfig& conf = inputConfig.conv_conf();
     padding_.push_back(conf.padding());
     stride_.push_back(conf.stride());
+    dilation_.push_back(conf.dilation());
     filterSize_.push_back(conf.filter_size());
     paddingY_.push_back(conf.padding_y());
     strideY_.push_back(conf.stride_y());
+    dilationY_.push_back(conf.dilation_y());
     filterSizeY_.push_back(conf.filter_size_y());
     filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
     channels_.push_back(conf.channels());
@@ -89,7 +91,11 @@ size_t ConvBaseLayer::calOutputSize() {
   size_t layerSize = 0;
 
   auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
+    size_t filterSizeY;
+    size_t filterSize;
     for (size_t i = 0; i < inputLayers_.size(); i++) {
+      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
+      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
       inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
       inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
       const ConvConfig& conf = config_.inputs(i).conv_conf();
@@ -98,17 +104,17 @@ size_t ConvBaseLayer::calOutputSize() {
           inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
         if (inW[i] == 0) inW[i] = conf.output_x();
         outH.push_back(imageSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(imageSize(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
+        outW.push_back(
+            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
       } else {
         if (inH[i] == 0)
           inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
         if (inW[i] == 0) inW[i] = conf.img_size();
         outH.push_back(outputSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
         outW.push_back(outputSize(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
       }
       CHECK_EQ(outH[i], outH[0]);
       CHECK_EQ(outW[i], outW[0]);
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index e9d15d94f8..223bce8e29 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -40,6 +40,10 @@ protected:
   IntV stride_;
   /// The y dimension of the stride.
   IntV strideY_;
+  /// The x dimension of the dilation.
+  IntV dilation_;
+  /// The y dimension of the dilation.
+  IntV dilationY_;
   /// The x dimension of a filter kernel.
   IntV filterSize_;
   /// The y dimension of a filter kernel.
diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp
index 5c23198629..5469c41c87 100644
--- a/paddle/gserver/layers/ConvBaseOperator.cpp
+++ b/paddle/gserver/layers/ConvBaseOperator.cpp
@@ -59,7 +59,8 @@ void ConvBaseOperator::allocConvWorkSpace() {
                     &bwdDataAlgo_,
                     &bwdDataLimitBytes_,
                     &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_);
+                    &bwdFilterLimitBytes_,
+                    /*useDilation*/ false);
 
   size_t maxWorkSpace = 0;
   maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
index eb6b0445c9..08f36c516c 100644
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -41,6 +41,11 @@ void ConvBaseProjection::getConvParams() {
   strideH_ = conf.stride_y();
   strideW_ = conf.stride();
 
+  dilationH_ = conf.dilation_y();
+  dilationW_ = conf.dilation();
+  CHECK_GT(dilationH_, 0);
+  CHECK_GT(dilationW_, 0);
+
   filterH_ = conf.filter_size_y();
   filterW_ = conf.filter_size();
 
@@ -77,7 +82,9 @@ void ConvBaseProjection::initCudnn() {
                                    paddingH_,
                                    paddingW_,
                                    strideH_,
-                                   strideW_);
+                                   strideW_,
+                                   dilationH_,
+                                   dilationW_);
 
   // initialize all to default algorithms
   fwdAlgo_ = 0;
@@ -131,7 +138,9 @@ void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
                                   paddingH_,
                                   paddingW_,
                                   strideH_,
-                                  strideW_);
+                                  strideW_,
+                                  dilationH_,
+                                  dilationW_);
 }
 
 void ConvBaseProjection::reshape(int batchSize) {
@@ -140,6 +149,10 @@ void ConvBaseProjection::reshape(int batchSize) {
   CHECK_EQ(calInputSize(), in_->value->getWidth());
 
   reshapeTensorDesc(batchSize);
+  bool useDilation = false;
+  if (dilationH_ > 1 || dilationW_ > 1) {
+    useDilation = true;
+  }
   hl_conv_workspace(imageDesc_,
                     outputDesc_,
                     filterDesc_,
@@ -149,7 +162,8 @@ void ConvBaseProjection::reshape(int batchSize) {
                     &bwdDataAlgo_,
                     &bwdDataLimitBytes_,
                     &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_);
+                    &bwdFilterLimitBytes_,
+                    useDilation);
 
   size_t maxWorkSpace = 0;
   maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index e9d9f8f1b2..ebdb57845b 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -63,6 +63,7 @@ protected:
   int configChannels_, configNumFilters_;
   int paddingH_, paddingW_;
   int strideH_, strideW_;
+  int dilationH_, dilationW_;
   int filterH_, filterW_;
   /// One group offset of input data.
   int inputOffset_;
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index 5b7ecc5560..6f0106b713 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -25,12 +25,12 @@ size_t ConvProjection::calOutputSize() {
   if (imageH_ == 0) imageH_ = configImgH_;
   if (imageW_ == 0) imageW_ = configImgW_;
   outputH_ = outputSize(imageH_,
-                        filterH_,
+                        (filterH_ - 1) * dilationH_ + 1,
                         paddingH_,
                         strideH_,
                         /* caffeMode */ true);
   outputW_ = outputSize(imageW_,
-                        filterW_,
+                        (filterW_ - 1) * dilationW_ + 1,
                         paddingW_,
                         strideW_,
                         /* caffeMode */ true);
diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
index 8ce591d476..d5407555b2 100644
--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -80,13 +80,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
       << "input of " << getName()
       << " must be a sequence or a nested sequence.";
   CHECK_EQ(input.value->getWidth(), 1UL)
-      << "input of " << getName()
-      << " is score over a sequence or a nested sequence, so its width "
-      << " must be 1.";
+      << "input of " << getName() << " are scores over a sequence or "
+      << "a nested sequence, so its width must be 1.";
 
   if (useGpu_) {
-    // this Layer runs only in CPU, if the model is runing on GPU,
-    // then copy the input to this layer from GPU to CPU.
+    /*
+     * Currently, this layer only runs on CPU. If the rest of the model is
+     * running on GPU, copy the input of this layer from GPU to CPU.
+     */
     Matrix::resizeOrCreate(scores_,
                            inputScore->getHeight(),
                            1,
@@ -97,6 +98,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
     scores_ = inputScore;
   }
 
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, all matrices currently hold real number types, but the
+   * output of this layer, which is some selected indices of the given
+   * sequence, is actually filled with int values. Storing int information
+   * in a real number matrix is dangerous, since the real numbers will be
+   * converted to int types.
+   */
   Matrix::resizeOrCreate(
       output_.value,
       input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 30f567eaf8..d201fac65e 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -57,11 +57,14 @@ bool MKLDNNFcLayer::init(const LayerMap& layerMap,
 }
 
 void MKLDNNFcLayer::convertWeightsFromPaddle() {
-  if (FLAGS_use_mkldnn_wgt) {
+  if (hasInitedWgt_) {
     return;
   }
 
-  if (hasInitedWgt_) {
+  // TODO(TJ): dst format should get from wgtVal_
+  int dstFmt = PARAM_FORMAT_MKLDNN_OI;
+  int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
+  if (srcFmt == dstFmt) {
     return;
   }
 
@@ -78,6 +81,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
   MatrixPtr paddleWgtT;
   paddleWgt->transpose(paddleWgtT, true);
   weight_->getW()->copyFrom(*paddleWgtT);
+  weight_->getParameterPtr()->setHeaderFormat(dstFmt);
   hasInitedWgt_ = true;
 }
 
diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/gserver/layers/ScaleShiftLayer.cpp
new file mode 100644
index 0000000000..35fd038ab4
--- /dev/null
+++ b/paddle/gserver/layers/ScaleShiftLayer.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer that applies a linear transformation to each element in each row
+ * of the input matrix. For each element, the layer first re-scales it and
+ * then adds a bias to it.
+ *
+ * \f[
+ *    y = wx + b
+ * \f]
+ *
+ * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
+ *
+ */
+
+class ScaleShiftLayer : public Layer {
+protected:
+  std::unique_ptr<Weight> scale_;
+  std::unique_ptr<Weight> offset_;
+
+public:
+  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(scale_shift, ScaleShiftLayer);
+
+bool ScaleShiftLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 1U);
+  scale_.reset(new Weight(1, 1, parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
+  }
+  return true;
+}
+
+void ScaleShiftLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+  resetOutput(inV->getHeight(), inV->getWidth());
+  MatrixPtr outV = getOutputValue();
+  real scaleValue = scale_->getW()->getElement(0, 0);
+  outV->mulScalar(*inV, scaleValue);
+  if (offset_) {
+    real offsetValue = offset_->getW()->getElement(0, 0);
+    outV->add(offsetValue);
+  }
+}
+
+void ScaleShiftLayer::backward(const UpdateCallback& callback) {
+  /* For y = w * x + b: accumulate the gradients of w, b and the input. */
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outG = getOutputGrad();
+
+  /* Calculate the parameter gradient for the current layer */
+  if (scale_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
+    rowSumMtx->sumOfProducts(
+        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
+    scale_->getWGrad()->sumCols(
+        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
+    scale_->getParameterPtr()->incUpdate(callback);
+  }
+  if (offset_ && offset_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    rowSumMtx->sumRows(*outG, 1., 0.);
+    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
+    offset_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers error */
+  if (inG) {
+    real scaleValue = scale_->getW()->getElement(0, 0);
+    inG->add(*outG, scaleValue);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp b/paddle/gserver/layers/SequenceSliceLayer.cpp
new file mode 100644
index 0000000000..d3a83fad27
--- /dev/null
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SequenceSliceLayer : public Layer {
+public:
+  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, all matrices currently hold real numbers, but the
+   * second and the (optional) third input, which are the indices selected
+   * from the given sequence to trim it, actually carry integer values.
+   * Storing integers in real-number matrices is dangerous because the reals
+   * are converted back to int when they are used. If a user fills these
+   * matrices by hand, invalid data may occur.
+   */
+
+  MatrixPtr startIdsOnCpu_;
+  MatrixPtr endIdsOnCpu_;
+
+  std::vector<int> selectedRows_;
+  IVectorPtr rowIndice_;
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+  std::vector<int> outSubSeqStartPos_;
+  std::vector<int> outSeqStartPos_;
+
+  void checkInputs();
+  void copySliceIdsToCpu();
+  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
+};
+
+REGISTER_LAYER(seq_slice, SequenceSliceLayer);
+
+bool SequenceSliceLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_GE(inputLayers_.size(), 2U);
+  CHECK_LE(inputLayers_.size(), 3U);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceSliceLayer::checkInputs() {
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
+                           << "must be a sequence.";
+  const MatrixPtr indices1 = getInputValue(1);
+  CHECK_EQ(static_cast<size_t>(indices1->getHeight()),
+           inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+                                : inputSeq.getNumSequences())
+      << "Height of the second input should be equal to number of sequence "
+      << "in the first input.";
+  if (inputLayers_.size() == 3) {
+    const MatrixPtr indices2 = getInputValue(2);
+    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
+        << "start indices and end indices should have the same height.";
+    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
+        << "start indices and end indices should have the same Width.";
+  }
+}
+
+void SequenceSliceLayer::copySliceIdsToCpu() {
+  const MatrixPtr indices1 = getInputValue(1);
+  if (inputLayers_.size() == 2U) {
+    if (config_.select_first()) {
+      Matrix::resizeOrCreate(startIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      startIdsOnCpu_->copyFrom(*indices1);
+      endIdsOnCpu_ = nullptr;
+    } else {
+      Matrix::resizeOrCreate(endIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      endIdsOnCpu_->copyFrom(*indices1);
+      startIdsOnCpu_ = nullptr;
+    }
+  } else if (inputLayers_.size() == 3U) {
+    Matrix::resizeOrCreate(startIdsOnCpu_,
+                           indices1->getHeight(),
+                           indices1->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    startIdsOnCpu_->copyFrom(*indices1);
+
+    const MatrixPtr indices2 = getInputValue(2);
+    Matrix::resizeOrCreate(endIdsOnCpu_,
+                           indices2->getHeight(),
+                           indices2->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    endIdsOnCpu_->copyFrom(*indices2);
+  }
+}
+
+void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
+                                         const MatrixPtr ends) {
+  // Gather the selected input row ids and rebuild the output sequence info.
+  CHECK(starts || ends) << "At least one of the start or end indices "
+                        << "should be given.";
+  bool hasSubseq = getInput(0).hasSubseq();
+
+  outSeqStartPos_.resize(1, 0);
+  outSubSeqStartPos_.resize(1, 0);
+  selectedRows_.clear();
+
+  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
+  size_t rowIdx = 0;
+  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
+    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
+      for (size_t k = 0; k < beamSize; ++k) {
+        // -1 ends the list of indices in this row; skip whatever follows it.
+        if (starts && starts->getElement(rowIdx, k) == -1.) break;
+        if (ends && ends->getElement(rowIdx, k) == -1.) break;
+        int begPos = inputSeqInfoVec_[i][j];
+        if (starts) begPos += starts->getElement(rowIdx, k);
+
+        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
+        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
+        // compare with a signed 0: against 0U a negative length would pass.
+        int seqLen = endPos - begPos + 1;
+        CHECK_GT(seqLen, 0);
+        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
+        hasSubseq
+            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
+            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
+      }
+      rowIdx++;
+    }
+    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
+
+  if (hasSubseq) {
+    ICpuGpuVector::resizeOrCreate(
+        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
+    output_.subSequenceStartPositions->copyFrom(
+        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
+  }
+}
+
+void SequenceSliceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  checkInputs();
+
+  const Argument& inputSeq = getInput(0);
+  inputSeqInfoVec_.clear();
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  if (!useGpu_) {
+    if (inputLayers_.size() == 2U) {
+      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
+      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
+    } else if (inputLayers_.size() == 3U) {
+      startIdsOnCpu_ = getInputValue(1);
+      endIdsOnCpu_ = getInputValue(2);
+    }
+  } else {
+    copySliceIdsToCpu();
+  }
+
+  /*
+   * calculate the selected row indices in a batch, and build the output
+   * sequence information.
+   */
+  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
+
+  resetOutput(selectedRows_.size(), getSize());
+
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SequenceSliceLayer::backward(const UpdateCallback& callback) {
+  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
index 648d3908f3..e9bee77212 100644
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -52,23 +52,34 @@ private:
    *   ]
    *
    * ths output is saved to private member rowIndice_;
-   * [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
-   *  16,17,18,19,20,21,22,23,24,25,26,27]
+   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
    */
 
-  void calSelectedCols(const MatrixPtr selectedIndices,
+  void calSelectedRows(const MatrixPtr selectedIndices,
                        const std::vector<std::vector<int>>& inputSeqInfo);
 
-  // if the second input of this layer is on GPU memory, copy it to CPU memory.
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, all matrices currently hold real numbers, but the
+   * second input, which is the indices selected from the given sequence to
+   * trim the nested sequence, actually carries integer values. Storing
+   * integers in real-number matrices is dangerous, since the real numbers
+   * will be converted to int types. If a user fills this matrix by hand,
+   * invalid data may occur.
+   *
+   * If the second input of this layer is in GPU memory, copy it to CPU memory.
+   */
   MatrixPtr selIdsCpu_;
 
-  // reorganized sequenceStartPositions and subSequenceStartPositions
-  // into a 2d vector to facilitate the sequence selection process.
+  /*
+   * reorganize sequenceStartPositions and subSequenceStartPositions
+   * into a 2d vector to facilitate the sequence selection process.
+   */
   std::vector<std::vector<int>> inputSeqInfoVec_;
 
-  // the final selected row indices in a batch,
-  // rowIdx_ and selectedRows_ actually share a same memory.
+  /* store the final selected row indices in a batch */
   IVectorPtr rowIndice_;
+  /* rowIndice_ and selectedRows_ actually share the same memory. */
   std::vector<int> selectedRows_;
 };
 
@@ -83,7 +94,7 @@ bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
   return true;
 }
 
-void SubNestedSequenceLayer::calSelectedCols(
+void SubNestedSequenceLayer::calSelectedRows(
     const MatrixPtr selectedIndices,
     const std::vector<std::vector<int>>& inputSeqInfo) {
   selectedRows_.clear();
@@ -160,7 +171,7 @@ void SubNestedSequenceLayer::forward(PassType passType) {
   Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
                               inputSeq.subSequenceStartPositions,
                               inputSeqInfoVec_);
-  calSelectedCols(selIdsCpu_, inputSeqInfoVec_);
+  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
 
   resetOutput(selectedRows_.size(), getSize());
   getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index c2a2993620..346c01ced6 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -34,6 +34,12 @@ add_unittest_without_exec(test_CRFLayerGrad
 add_test(NAME test_CRFLayerGrad
     COMMAND test_CRFLayerGrad)
 
+################ test_SeqSliceLayerGrad ####################
+add_unittest_without_exec(test_SeqSliceLayerGrad
+    test_SeqSliceLayerGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_SeqSliceLayerGrad
+    COMMAND test_SeqSliceLayerGrad)
 
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index 99c8c4948c..de1635be2a 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -330,9 +330,7 @@ void MKLDNNTester::run(const TestConfig& dnn,
   log_ = log;
   lvl_ = level;
 
-  // Firstly test FLAGS_use_mkldnn_wgt = false
-  FLAGS_use_mkldnn_wgt = false;
-  // reset and run once
+  // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
   reset(dnn, ref, batchSize);
   randomWgtDatas();
   clearWgtDiffs();
@@ -342,17 +340,32 @@ void MKLDNNTester::run(const TestConfig& dnn,
     runOnce();
   }
 
-  // Then test FLAGS_use_mkldnn_wgt = true
-  FLAGS_use_mkldnn_wgt = true;
-  // after run once the mkldnn weight has been stored in dnnlayer
+  if (parameters_[DNN].empty()) {
+    // has no parameters
+    return;
+  }
+
+  // After run some iterations, the mkldnn weight has been stored in dnnLayer
+  // and we can also get the mkldnn weight parameter header format.
+  // Weight parameter should always be index 0 (and bias index 1).
+  // TODO(TJ): should also consider mean and var format when batchnorm ready
+  int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat();
+  int refWgtFmt = parameters_[REF][0]->getHeaderFormat();
+  if (dnnWgtFmt == refWgtFmt) {
+    // weight formats are equal, so there is nothing more to check
+    return;
+  }
+
   // then save the weights and restart again
   vector<VectorPtr> dnnWgts, refWgts;
   CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
   saveWgt(parameters_[DNN], dnnWgts);
   saveWgt(parameters_[REF], refWgts);
 
-  // restart again with flag true
+  // restart again with dnn weight format
   reset(dnn, ref, batchSize);
+  // TODO(TJ): should also consider mean and var format when batchnorm ready
+  parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt);
 
   // restore wgt
   restoreWgt(dnnWgts, parameters_[DNN]);
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index 522eeaf24b..e55e4493ff 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -108,7 +108,7 @@ private:
    * if many(>failRate) wrong(abs(dnn-ref)/abs(ref)>thres) points return the
    * max(diff/ref)
    * else return sum(abs(a-b)) / sum(abs(b))
-   * The return value should smaller than eps when passing.
+   * The return value should be smaller than eps when passing.
    */
   double getDelta(const real* d1,
                   const real* d2,
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 0f312b6ca5..9946f76664 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifndef PADDLE_ONLY_CPU
+#include <cudnn.h>
+#endif
 #include <gtest/gtest.h>
 #include <string>
 #include <vector>
@@ -189,10 +192,16 @@ TEST(Projection, scaling) {
 void testProjectionConv(size_t groups, bool isDeconv) {
   const int NUM_FILTERS = 18;
   const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 4;
+  const int FILTER_SIZE_Y = 2;
   const int CHANNELS = 3;
   const int IMAGE_SIZE = 16;
 
+#if CUDNN_VERSION >= 6000
+  const int DILATION = 2;
+#else
+  const int DILATION = 1;
+#endif
+
   ProjectionConfig conf;
   if (isDeconv) {
     conf.set_type("convt");
@@ -209,6 +218,8 @@ void testProjectionConv(size_t groups, bool isDeconv) {
   conv->set_padding_y(1);
   conv->set_stride(2);
   conv->set_stride_y(2);
+  conv->set_dilation(DILATION);
+  conv->set_dilation_y(DILATION);
   conv->set_groups(groups);
   if (isDeconv) {
     conv->set_filter_channels(NUM_FILTERS / conv->groups());
@@ -217,12 +228,12 @@ void testProjectionConv(size_t groups, bool isDeconv) {
   }
   conv->set_img_size(IMAGE_SIZE);
   int output_x = outputSize(conv->img_size(),
-                            conv->filter_size(),
+                            (conv->filter_size() - 1) * DILATION + 1,
                             conv->padding(),
                             conv->stride(),
                             /* caffeMode */ true);
   int output_y = outputSize(conv->img_size(),
-                            conv->filter_size_y(),
+                            (conv->filter_size_y() - 1) * DILATION + 1,
                             conv->padding_y(),
                             conv->stride_y(),
                             /* caffeMode */ true);
@@ -424,27 +435,38 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
+  int dilation = 1;
+  if (type == "cudnn_conv") {
+#if CUDNN_VERSION >= 6000
+    dilation = 2;
+#else
+    dilation = 1;
+#endif
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   ConvConfig* conv = input->mutable_conv_conf();
   conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
+  conv->set_filter_size_y(2);
   conv->set_channels(3);
   conv->set_padding(0);
   conv->set_padding_y(1);
   conv->set_stride(2);
   conv->set_stride_y(2);
+  conv->set_dilation(dilation);
+  conv->set_dilation_y(dilation);
   conv->set_groups(1);
   conv->set_filter_channels(conv->channels() / conv->groups());
   conv->set_img_size(16);
-  conv->set_img_size_y(8);
+  conv->set_img_size_y(16);
   conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
+                                (conv->filter_size() - 1) * dilation + 1,
                                 conv->padding(),
                                 conv->stride(),
                                 /* caffeMode */ true));
   conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
+                                (conv->filter_size_y() - 1) * dilation + 1,
                                 conv->padding_y(),
                                 conv->stride_y(),
                                 /* caffeMode */ true));
@@ -2007,6 +2029,21 @@ TEST(Layer, RowL2NormLayer) {
   }
 }
 
+TEST(Layer, ScaleShiftLayer) {
+  const size_t batchSize = 16;
+  const size_t size = 32;
+  TestConfig config;
+  config.layerConfig.set_type("scale_shift");
+  config.layerConfig.set_size(size);
+  config.biasSize = 1;
+  config.inputDefs.push_back(
+      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index f930c72fde..d36f72360f 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -269,7 +269,8 @@ TEST(Compare, img_conv2) {
   bool useGpu = FLAGS_use_gpu;
   double eps = FLAGS_checkgrad_eps;
   FLAGS_use_gpu = true;
-  FLAGS_checkgrad_eps = 1e-2;
+  // Sometimes, this unit test will fail with 1e-2
+  FLAGS_checkgrad_eps = 4e-2;
   compareNetwork(config_file_a, config_file_b);
   FLAGS_use_gpu = useGpu;
   FLAGS_checkgrad_eps = eps;
diff --git a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
new file mode 100644
index 0000000000..e1d4ae1617
--- /dev/null
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -0,0 +1,225 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const int MAX_SEQ_NUM = 17;
+const int MAX_SEQ_LEN = 23;
+const int MAX_BEAM_SIZE = 13;
+
+const size_t SEED = (size_t)(time(NULL));
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
+  seqStartPos.resize(1, 0);
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int j = 0; j < subSeqNum; ++j)
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % MAX_SEQ_LEN)));
+    seqStartPos.push_back(subSeqStartPos.back());
+  }
+}
+
+/*
+  generate start indices according to sequence start positions.
+ */
+void genStarts(vector<int>& seqStartPos,
+               vector<vector<real>>& starts,
+               size_t beamSize) {
+  starts.clear();
+  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    vector<real> randStarts =
+        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
+    copy(begin(randStarts), end(randStarts), begin(starts[i]));
+  }
+}
+
+/*
+  generate end indices according to sequence start positions and start indices.
+ */
+void genEnds(vector<int>& seqStartPos,
+             vector<vector<real>>& starts,
+             vector<vector<real>>& ends,
+             size_t beamSize) {
+  CHECK_EQ(seqStartPos.size() - 1, starts.size());
+  ends.clear();
+  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < starts.size(); ++i) {
+    for (size_t j = 0; j < starts[i].size(); ++j) {
+      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+      CHECK_GE(seqLen - 1, starts[i][j]);
+      if (starts[i][j] == -1.) break;
+      if (starts[i][j] == (seqLen - 1)) {
+        ends[i][j] = starts[i][j];
+      } else {
+        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
+      }
+    }
+  }
+}
+
+void genTestData(vector<int>& seqStartPos,
+                 vector<int>& subSeqStartPos,
+                 vector<vector<real>>& starts,
+                 vector<vector<real>>& ends,
+                 bool hasSubseq) {
+  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
+  genSeqInfo(seqStartPos, subSeqStartPos);
+
+  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
+  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
+}
+
+template <typename T>
+void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
+  size_t totalSize{0};
+  for (auto const& items : inVec) totalSize += items.size();
+  outVec.reserve(totalSize);
+
+  for (auto& items : inVec)
+    move(items.begin(), items.end(), back_inserter(outVec));
+}
+
+void testSeqSliceLayer(bool hasSubseq,
+                       bool useGpu,
+                       vector<int>& seqStartPos,
+                       vector<int>& subSeqStartPos,
+                       vector<vector<real>>& starts,
+                       vector<vector<real>>& ends) {
+  // layer size is not crucial for this layer,
+  // so a small layer size is used in the unittest.
+  const size_t layerSize{4};
+  TestConfig config;
+  config.layerConfig.set_type("seq_slice");
+  config.layerConfig.set_size(layerSize);
+
+  // add the first input
+  MatrixPtr seqInputPtr =
+      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
+                     layerSize,
+                     false,
+                     false);
+  seqInputPtr->randomizeUniform();
+
+  if (hasSubseq) {
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                "seq_input",
+                                seqInputPtr,
+                                seqStartPos,
+                                subSeqStartPos});
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
+  }
+  config.layerConfig.add_inputs();
+
+  // add start indices
+  if (starts.size()) {
+    vector<real> startsToVec;
+    flatten2dVector(starts, startsToVec);
+
+    MatrixPtr startMatrixPtr =
+        Matrix::create(starts.size(), starts[0].size(), false, false);
+    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
+
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(true);
+  }
+
+  // add end indices
+  if (ends.size()) {
+    vector<real> endsToVec;
+    flatten2dVector(ends, endsToVec);
+
+    MatrixPtr endMatrixPtr =
+        Matrix::create(ends.size(), ends[0].size(), false, false);
+    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
+
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(false);
+  }
+
+  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
+}
+
+TEST(Layer, SeqSliceLayer) {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<vector<real>> starts;
+  vector<vector<real>> ends;
+
+  std::vector<bool> mode = {false};
+#ifndef PADDLE_ONLY_CPU
+  mode.push_back(true);
+#endif
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  for (bool hasSubseq : {true, false}) {
+    LOG(INFO) << "hasSubSeq : " << hasSubseq;
+    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
+    for (bool useGpu : mode) {
+      vector<vector<real>> tmp;
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 8035d93bfe..9cc4233e43 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(detail)
 
 cc_library(memory SRCS memory.cc)
-cc_library(memcpy SRCS memcpy.cc DEPS device_context)
+cc_library(memcpy SRCS memcpy.cc)
 
 cc_library(paddle_memory
     DEPS
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index f61e67a329..a270bd5958 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -27,7 +27,7 @@ limitations under the License. */
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index aaab1142ca..a19a3e3675 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -16,8 +16,6 @@ limitations under the License. */
 
 #include <cstring>  // for memcpy
 
-#include "paddle/platform/device_context.h"
-
 namespace paddle {
 namespace memory {
 
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 207025f9b1..29bc26f9d3 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -13,22 +13,38 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/memory.h"
+
+#include <algorithm>  // for transform
+#include <cstring>    // for memcpy
+#include <memory>     // for unique_ptr
+#include <mutex>      // for call_once
+
+#include "glog/logging.h"
+
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/gpu_info.h"
 
-#include <cstring>  // for memcpy
+DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
 namespace memory {
 
-detail::BuddyAllocator* GetCPUBuddyAllocator() {
-  static detail::BuddyAllocator* a = nullptr;
-  if (a == nullptr) {
-    a = new detail::BuddyAllocator(new detail::CPUAllocator,
-                                   platform::CpuMinChunkSize(),
-                                   platform::CpuMaxChunkSize());
-  }
-  return a;
+using BuddyAllocator = detail::BuddyAllocator;
+
+std::once_flag cpu_allocator_flag;
+std::once_flag gpu_allocator_flag;
+
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static std::unique_ptr<BuddyAllocator> a{nullptr};
+
+  std::call_once(cpu_allocator_flag, [&]() {
+    a.reset(new BuddyAllocator(new detail::CPUAllocator,
+                               platform::CpuMinChunkSize(),
+                               platform::CpuMaxChunkSize()));
+  });
+
+  return a.get();
 }
 
 template <>
@@ -48,20 +64,36 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
 
 #ifndef PADDLE_ONLY_CPU
 
-detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static detail::BuddyAllocator** as = NULL;
-  if (as == NULL) {
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  using BuddyAllocVec = std::vector<BuddyAllocator*>;
+  static std::unique_ptr<BuddyAllocVec, void (*)(BuddyAllocVec * p)> as{
+      new BuddyAllocVec, [](BuddyAllocVec* vec) {
+        for (BuddyAllocator* a : *vec) delete a;  // free each allocator
+        delete vec;  // a custom deleter must free the vector as well
+      }};
+
+  // One buddy allocator per GPU device, indexed by device id.
+  auto& allocators = *as.get();
+
+  // Build all allocators exactly once, on the first call.
+  std::call_once(gpu_allocator_flag, [&]() {
     int gpu_num = platform::GetDeviceCount();
-    as = new detail::BuddyAllocator*[gpu_num];
+    allocators.reserve(gpu_num);
     for (int gpu = 0; gpu < gpu_num; gpu++) {
       platform::SetDeviceId(gpu);
-      as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator,
-                                           platform::GpuMinChunkSize(),
-                                           platform::GpuMaxChunkSize());
+      allocators.emplace_back(new BuddyAllocator(new detail::GPUAllocator,
+                                                 platform::GpuMinChunkSize(),
+                                                 platform::GpuMaxChunkSize()));
     }
-  }
+    VLOG(3) << "\n\nNOTE: each GPU device use "
+            << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n"
+            << "You can set environment variable '"
+            << platform::kEnvFractionGpuMemoryToUse
+            << "' to change the fraction of GPU usage.\n\n";
+  });
+
   platform::SetDeviceId(gpu_id);
-  return as[gpu_id];
+  return allocators[gpu_id];
 }
 
 template <>
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 72351b9dfa..11bbb88187 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
 namespace paddle {
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a7c89787e4..f0fd12f1b5 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -42,9 +42,12 @@ function(op_library TARGET)
 endfunction()
 
 add_subdirectory(math)
+
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+op_library(gather_op SRCS gather_op.cc gather_op.cu)
 
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
+op_library(scatter_op SRCS scatter_op.cc scatter_op.cu)
 
 cc_library(net_op SRCS net_op.cc DEPS op_registry)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
@@ -66,5 +69,7 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
 
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
     DEPS framework_proto tensor op_registry operator net_op)
-op_library(uniform_random_op
-        SRCS uniform_random_op.cc uniform_random_op.cu)
+op_library(uniform_random_op SRCS uniform_random_op.cc uniform_random_op.cu)
+op_library(lookup_table_op SRCS lookup_table_op.cc lookup_table_op.cu)
+op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
+op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index a623c551e1..ab1e1c101a 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -39,11 +39,10 @@ class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto X = ctx.Input<Tensor>("X");
 
-    // TODO(superjom) add enforce here after helper functions ready
-    X_grad->Resize(X->dims());
+    dX->Resize(X->dims());
   }
 };
 
@@ -70,9 +69,7 @@ namespace ops = paddle::operators;
 REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
             ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad,
             ops::OnehotCrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(
-    onehot_cross_entropy,
-    ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    onehot_cross_entropy_grad,
-    ops::OnehotCrossEntropyGradientOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
+                       ops::OnehotCrossEntropyOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad,
+                       ops::OnehotCrossEntropyGradientOpKernel<float>);
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 4bbc8f093a..d999bfce58 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -12,10 +12,122 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
-#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__host__ __device__ T clipping_log(const T x) {
+  static_assert(std::is_floating_point<T>::value, "requires floating point");
+  const T kApproInf = 1e20;  // finite stand-in returned instead of +/-inf
+  T v = log(x);
+  if (v == INFINITY) {
+    return kApproInf;
+  }
+  if (v == -INFINITY) {
+    return -kApproInf;
+  }
+  return v;
+}
+
+template <typename T>
+__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
+                                   const int N, const int D) {
+  // TODO(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
+  // CUDA_1D_KERNEL_LOOP(i, N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
+    Y[i] = -clipping_log(X[i * D + label[i]]);
+  }
+}
+
+// TODO(qingqing): make zero setting an common function.
+template <typename T>
+__global__ void zero(T* X, const int N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    X[i] = 0.0;
+  }
+}
+
+template <typename T>
+__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                           const int* label, const int N,
+                                           const int D) {
+  // TODO(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
+  // CUDA_1D_KERNEL_LOOP(i, N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    int idx = i * D + label[i];
+    dX[idx] = -dY[i] / X[idx];
+  }
+}
+
+template <typename T>
+class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto X = ctx.Input<Tensor>("X");
+    const T* Xdata = X->data<T>();
+    const int* label_data = ctx.Input<Tensor>("label")->data<int>();
+    auto Y = ctx.Output<Tensor>("Y");
+    Y->mutable_data<T>(ctx.GetPlace());
+    T* Ydata = Y->data<T>();
+
+    int N = X->dims()[0];
+    int D = X->dims()[1];
+    int block = 512;
+    int grid = (N + block - 1) / block;
+    // TODO(qingqing) launch kernel on specified stream
+    // base on ExecutionContext.
+    CrossEntropyKernel<T><<<grid, block>>>(Ydata, Xdata, label_data, N, D);
+  }
+};
+
+template <typename T>
+class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto X = ctx.Input<Tensor>("X");
+    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("label");
+
+    auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
+    auto* dYdata = dY->template data<T>();
+    auto* Xdata = X->template data<T>();
+    auto* label_data = label->data<int>();
+
+    int N = X->dims()[0];
+    int D = X->dims()[1];
+    int block = 512;
+    int grid = (N * D + block - 1) / block;
+    zero<T><<<grid, block>>>(dXdata, N * D);
+
+    grid = (N + block - 1) / block;
+    // TODO(qingqing): launch kernel on specified stream
+    // base on ExecutionContext.
+    CrossEntropyGradientKernel<T><<<grid, block>>>(dXdata, dYdata, Xdata,
+                                                   label_data, N, D);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    onehot_cross_entropy,
-    ops::OnehotCrossEntropyOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
+                       ops::OnehotCrossEntropyOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad,
+                       ops::OnehotCrossEntropyGradientOpCUDAKernel<float>);
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index b7df92c9a9..eb4d1348de 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -21,7 +21,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename T>
-T tolerable_value(T x) {
+inline T tolerable_value(const T x) {
   static_assert(std::is_floating_point<T>::value,
                 "tolerable_value works only on float, "
                 "double and double double.");
@@ -39,10 +39,13 @@ T tolerable_value(T x) {
   return x;
 }
 
-template <typename Place, typename T>
+template <typename T>
 class OnehotCrossEntropyOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
     auto X = ctx.Input<Tensor>("X");
     const T* Xdata = X->data<T>();
     const int* label_data = ctx.Input<Tensor>("label")->data<int>();
@@ -62,10 +65,13 @@ class OnehotCrossEntropyOpKernel : public framework::OpKernel {
   }
 };
 
-template <typename Place, typename T>
+template <typename T>
 class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
     auto X = ctx.Input<Tensor>("X");
     auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
@@ -79,6 +85,8 @@ class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
     const int batch_size = X->dims()[0];
     const int class_num = X->dims()[1];
 
+    // TODO(qingqing): make zero setting an common function.
+    memset(dXdata, 0, sizeof(T) * batch_size * class_num);
     for (int i = 0; i < batch_size; ++i) {
       int index = i * class_num + label_data[i];
       dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]);
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index fd380ca851..969998ce2e 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -26,7 +26,7 @@ class FillZerosLikeKernel : public framework::OpKernel {
     auto* output = context.Output<framework::Tensor>("Dst");
     output->mutable_data<T>(context.GetPlace());
     auto t = framework::EigenVector<T>::Flatten(*output);
-    t.device(context.GetEigenDevice<Place>()) = t.constant(T(0));
+    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
   }
 };
 
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
index d6e6990394..92fb51ec17 100644
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cstring>
 
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/place.h"
 
@@ -25,13 +26,13 @@ namespace operators {
 
 // Implementation of CPU copy
 template <typename T>
-void CPUGather(const T* params, const int* indices, const int slice_size,
+void CPUGather(const T* src, const int* indices, const int slice_size,
                const int index_size, T* output) {
   const size_t slice_bytes = slice_size * sizeof(T);
 
   for (int i = 0; i < index_size; ++i) {
     int index_ = indices[i];
-    memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
+    memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
   }
 }
 
@@ -55,7 +56,7 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
   int index_size = index->dims()[0];
 
   auto src_dims = src->dims();
-  paddle::framework::DDim output_dims(src_dims);
+  framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
   // slice size
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
new file mode 100644
index 0000000000..123bed296c
--- /dev/null
+++ b/paddle/operators/gather_op.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gather_op.h"
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+class GatherOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
+    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >= 0");
+    framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
+    output_dims[0] = batch_size;  // Out keeps X's dims except the first
+    ctx.Output<Tensor>("Out")->Resize(output_dims);
+  }
+};
+
+class GatherGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto X = ctx.Input<Tensor>("X");
+
+    X_grad->Resize(X->dims());
+  }
+};
+
+class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GatherOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The source input of gather op");
+    AddInput("Index", "The index input of gather op");
+    AddOutput("Out", "The output of gather op");
+    AddComment(R"DOC(
+Gather Operator by selecting from the first axis, 
+
+Out = X[Index]
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
+            ops::GatherGradOp);
+REGISTER_OP_CPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    gather_grad,
+    ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
new file mode 100644
index 0000000000..3f04a7b3f8
--- /dev/null
+++ b/paddle/operators/gather_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gather_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
new file mode 100644
index 0000000000..381854f301
--- /dev/null
+++ b/paddle/operators/gather_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class GatherOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Y = ctx.Output<Tensor>("Out");
+
+    Y->mutable_data<T>(ctx.GetPlace());
+    Gather<T>(ctx.GetPlace(), X, Index, Y);
+  }
+};
+
+template <typename Place, typename T>
+class GatherGradientOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    dX->mutable_data<T>(ctx.GetPlace());
+    ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
index d24d83f299..0ae1e99452 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -45,4 +45,8 @@ TEST(Gather, GatherData) {
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
+
+  delete src;
+  delete index;
+  delete output;
 }
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index f30bbce958..a85363ad81 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
-
    http://www.apache.org/licenses/LICENSE-2.0
-
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,25 +16,25 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-class GaussianRandomKernel : public framework::OpKernel {
+class CPUGaussianRandomKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     float mean = context.op_.GetAttr<float>("mean");
     float std = context.op_.GetAttr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>(0);
+    auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
-    // TODO(dzh): attribute does not support unsigned int.
-    // And we need a global random seed configuration.
-    int seed = context.op_.GetAttr<int>("seed");
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    std::minstd_rand engine;
     if (seed == 0) {
       seed = std::random_device()();
     }
-    std::mt19937 g(seed);
-    std::normal_distribution<T> distribution(mean, std);
+    engine.seed(seed);
+    std::normal_distribution<T> dist(mean, std);
     ssize_t size = framework::product(tensor->dims());
-    for (int i = 0; i < size; ++i) {
-      data[i] = distribution(g);
+    for (ssize_t i = 0; i < size; ++i) {
+      data[i] = dist(engine);
     }
   }
 };
@@ -48,7 +45,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>(0);
+    auto* tensor = context.Output<framework::Tensor>("Out");
     auto dims = GetAttr<std::vector<int>>("dims");
     PADDLE_ENFORCE(dims.size() > 0UL,
                    "dims can be one int or array. dims must be set.");
@@ -68,8 +65,8 @@ Use to initialize tensor with gaussian random generator.
 )DOC");
 
     AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
-    AddAttr<float>("mean", "mean value of random.").SetDefault(.0f);
-    AddAttr<float>("std", "minimum value of random value.").SetDefault(1.0f);
+    AddAttr<float>("mean", "mean of random tensor.").SetDefault(.0f);
+    AddAttr<float>("std", "std of random tensor.").SetDefault(1.0f);
     AddAttr<int>("seed",
                  "Random seed of generator."
                  "0 means use system wide seed")
@@ -83,4 +80,4 @@ Use to initialize tensor with gaussian random generator.
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
                              ops::GaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>);
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index 1340b1e1e9..018a4bfcb2 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -1,53 +1,65 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
-
    http://www.apache.org/licenses/LICENSE-2.0
-
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include <memory>
-#include <random>
-#include "paddle/platform/dynload/curand.h"
-#include "paddle/platform/gpu_info.h"
-
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename T>
-class GaussianRandomKernel : public framework::OpKernel {
+struct GaussianGenerator {
+  T mean_, std_;
+  unsigned int seed_;
+
+  __host__ __device__ GaussianGenerator(T mean, T std, unsigned int seed)
+      : mean_(mean), std_(std), seed_(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);  // every call restarts from the same seed
+    thrust::normal_distribution<T> dist(mean_, std_);
+    rng.discard(n);  // skip ahead so each index n draws a different value
+    return dist(rng);
+  }
+};
+
+template <typename T>
+class GPUGaussianRandomKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.op_.GetAttr<float>("mean");
-    float std = context.op_.GetAttr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>(0);
+    auto* tensor = context.Output<framework::Tensor>("Out");
     T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    int seed = context.op_.GetAttr<int>("seed");
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
     if (seed == 0) {
       std::random_device rd;
       seed = rd();
     }
-    curandGenerator_t g;
-    PADDLE_ENFORCE(platform::dynload::curandCreateGenerator(
-        &g, CURAND_RNG_PSEUDO_DEFAULT));
-    PADDLE_ENFORCE(
-        platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed));
-    platform::dynload::curandGenerateNormal(
-        g, data, framework::product(tensor->dims()), mean, std);
+    T mean = static_cast<T>(context.op_.GetAttr<float>("mean"));
+    T std = static_cast<T>(context.op_.GetAttr<float>("std"));
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    ssize_t N = framework::product(tensor->dims());
+    thrust::transform(index_sequence_begin, index_sequence_begin + N,
+                      thrust::device_ptr<T>(data),
+                      GaussianGenerator<T>(mean, std, seed));
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
+REGISTER_OP_GPU_KERNEL(gaussian_random,
+                       paddle::operators::GPUGaussianRandomKernel<float>);
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
new file mode 100644
index 0000000000..94d40890a7
--- /dev/null
+++ b/paddle/operators/lookup_table_op.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lookup_table_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LookupTableOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &context) const override {
+    auto table_t = context.Input<Tensor>("W");
+    auto ids_t = context.Input<Tensor>("Ids");
+    auto output_t = context.Output<Tensor>("Out");
+
+    output_t->Resize({ids_t->dims()[0], table_t->dims()[1]});
+  }
+};
+
+class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LookupTableOpMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("W",
+             "An input represents embedding tensors,"
+             " which is a learnable parameter.");
+    AddInput("Ids",
+             "An input with type int32 or int64"
+             " contains the ids to be looked up in W.");
+    AddOutput("Out", "The lookup results, which have the same type with W.");
+    AddComment(
+        "This operator is used to perform lookups on the parameter W,"
+        " then concatenated into a dense tensor.");
+  }
+};
+
+class LookupTableOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &context) const override {
+    auto table = context.Input<Tensor>("W");
+    auto d_table = context.Output<Tensor>(framework::GradVarName("W"));
+    d_table->Resize(table->dims());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
+            lookup_table_grad, ops::LookupTableOpGrad);
+
+REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>);
+REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>);
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
new file mode 100644
index 0000000000..27eee3436a
--- /dev/null
+++ b/paddle/operators/lookup_table_op.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Gathers rows: output[r, :] = table[ids[r], :] for r in [0, K). The template
+// ints are used as strides, so the launch must use matching block/grid sizes.
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void LookupTable(T* output, const T* table, const int32_t* ids,
+                            const int N, const int K, const int D) {
+  const int first_col = threadIdx.x;
+  const int row_stride = BlockDimY * GridDimX;
+  for (int r = blockIdx.x + threadIdx.y * GridDimX; r < K; r += row_stride) {
+    const int id = ids[r];
+    PADDLE_ASSERT(id >= 0);
+    PADDLE_ASSERT(id < N);
+    const T* src = table + id * D;
+    T* dst = output + r * D;
+    for (int c = first_col; c < D; c += BlockDimX) {
+      dst[c] = src[c];
+    }
+  }
+}
+
+// Scatter-adds rows: table[ids[r], :] += output[r, :] for r in [0, K); atomic
+// since rows may share an id. Template ints must match the launch sizes.
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids,
+                                const int N, const int K, const int D) {
+  const int first_col = threadIdx.x;
+  const int row_stride = BlockDimY * GridDimX;
+  for (int r = blockIdx.x + threadIdx.y * GridDimX; r < K; r += row_stride) {
+    const int id = ids[r];
+    PADDLE_ASSERT(id >= 0);
+    PADDLE_ASSERT(id < N);
+    const T* src = output + r * D;
+    T* dst = table + id * D;
+    for (int c = first_col; c < D; c += BlockDimX) {
+      paddle::platform::CudaAtomicAdd(&dst[c], src[c]);
+    }
+  }
+}
+
+// GPU forward kernel of lookup_table: gathers the rows of W named by Ids.
+template <typename T>
+class LookupTableCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* w = context.Input<Tensor>("W");
+    const auto* id_tensor = context.Input<Tensor>("Ids");
+    auto* out = context.Output<Tensor>("Out");
+    const size_t N = w->dims()[0];                // rows of W
+    const size_t D = w->dims()[1];                // values per row
+    const size_t K = product(id_tensor->dims());  // number of ids
+    const int32_t* ids = id_tensor->data<int32_t>();
+    const T* table = w->data<T>();
+    T* output = out->mutable_data<T>(context.GetPlace());
+    // Block/grid sizes repeat the kernel's <BlockDimX, BlockDimY, GridDimX>.
+    const dim3 threads(128, 8);
+    const dim3 grids(8, 1);
+    LookupTable<T, 128, 8, 8><<<grids, threads>>>(output, table, ids, N, K, D);
+  }
+};
+
+// GPU backward kernel of lookup_table: zero W@GRAD, then run LookupTableGrad.
+template <typename T>
+class LookupTableGradCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto ids_t = context.Input<Tensor>("Ids");
+    auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
+    int N = d_table_t->dims()[0];
+    int D = d_table_t->dims()[1];
+    int K = product(ids_t->dims());
+    const int32_t* ids = ids_t->data<int32_t>();
+    const T* d_output = d_output_t->data<T>();
+    T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+    // Zero W@GRAD first: the kernel launched below only adds into it.
+    auto t = framework::EigenVector<T>::Flatten(*d_table_t);
+    t.device(context.GetEigenDevice<platform::GPUPlace>()) =
+        t.constant(static_cast<T>(0));
+    // NOTE(review): no stream is passed here; confirm it runs after the fill.
+    dim3 threads(128, 8);
+    dim3 grids(8, 1);
+    LookupTableGrad<T, 128, 8, 8><<<grids, threads>>>(d_table, d_output, ids, N,
+                                                      K, D);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lookup_table_grad,
+                       ops::LookupTableGradCUDAKernel<float>);
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
new file mode 100644
index 0000000000..4da8079b91
--- /dev/null
+++ b/paddle/operators/lookup_table_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// CPU forward kernel of lookup_table: Out[i, :] = W[Ids[i], :].
+template <typename T>
+class LookupTableKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto table_t = context.Input<Tensor>("W");      // tensor of T
+    auto ids_t = context.Input<Tensor>("Ids");      // int32 tensor
+    auto output_t = context.Output<Tensor>("Out");  // tensor of T
+    const int64_t N = table_t->dims()[0];  // signed: compared with int32 ids
+    const size_t D = table_t->dims()[1];
+    auto ids = ids_t->data<int32_t>();
+    auto table = table_t->data<T>();
+    auto output = output_t->mutable_data<T>(context.GetPlace());
+    for (size_t i = 0, K = product(ids_t->dims()); i < K; ++i) {
+      PADDLE_ENFORCE_GE(ids[i], 0);  // reject negatives before indexing
+      PADDLE_ENFORCE_LT(ids[i], N);
+      memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+    }
+  }
+};
+
+// CPU backward kernel of lookup_table: W@GRAD[Ids[i], :] += Out@GRAD[i, :].
+template <typename T>
+class LookupTableGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto ids_t = context.Input<Tensor>("Ids");
+    auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
+    // N is signed so the int32 ids are not compared against an unsigned.
+    const int64_t N = d_table_t->dims()[0];
+    const size_t D = d_table_t->dims()[1];
+    const size_t K = product(ids_t->dims());  // id count, computed once
+    auto ids = ids_t->data<int32_t>();
+    const T* d_output = d_output_t->data<T>();
+    T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+    // Start from an all-zero gradient; rows are accumulated below.
+    auto t = framework::EigenVector<T>::Flatten(*d_table_t);
+    t.device(context.GetEigenDevice<platform::CPUPlace>()) =
+        t.constant(static_cast<T>(0));
+    for (size_t i = 0; i < K; ++i) {
+      PADDLE_ENFORCE_GE(ids[i], 0);  // reject negatives before indexing
+      PADDLE_ENFORCE_LT(ids[i], N);
+      T* row = d_table + ids[i] * D;
+      for (size_t j = 0; j < D; ++j) row[j] += d_output[i * D + j];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index affdd1ac2c..1e86fc3d16 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -25,8 +25,8 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
                                      const float alpha, const float* A,
                                      const float* B, const float beta, float* C,
                                      platform::DeviceContext* context) {
-  int lda = K;
-  int ldb = N;
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
   cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
               beta, C, ldc);
@@ -40,8 +40,8 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
                                       const double* B, const double beta,
                                       double* C,
                                       platform::DeviceContext* context) {
-  int lda = K;
-  int ldb = N;
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
   cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
               beta, C, ldc);
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 49d0f43508..d3d0e55a67 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
   MeanOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").AsNoGradient();
+    AddOutput("Out", "The output of mean op").NotInGradient();
     AddComment("Mean Operator");
   }
 };
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index fcb703e63b..9848af280b 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -55,9 +55,10 @@ class MeanGradKernel : public framework::OpKernel {
     IG->mutable_data<T>(context.GetPlace());
 
     T ig_size = (T)framework::product(IG->dims());
+    Eigen::DSizes<int, 1> bcast(ig_size);
 
     EigenVector<T>::Flatten(*IG).device(context.GetEigenDevice<Place>()) =
-        EigenScalar<T>::From(*OG) / ig_size;
+        (EigenVector<T>::From(*OG) / ig_size).broadcast(bcast);
   }
 };
 
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
new file mode 100644
index 0000000000..1eee9644ba
--- /dev/null
+++ b/paddle/operators/minus_op.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/minus_op.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator computing Out = X - Y; this class only implements shape inference.
+class MinusOp : public framework::OperatorWithKernel {
+ public:
+  MinusOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *left_tensor = ctx.Input<framework::Tensor>("X");
+    auto *right_tensor = ctx.Input<framework::Tensor>("Y");
+    // Only the element counts must agree; Out then takes X's dims.
+    PADDLE_ENFORCE_EQ(
+        framework::product(left_tensor->dims()),
+        framework::product(right_tensor->dims()),
+        "Minus operator must take two tensor with same num of elements");
+    ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
+  }
+};
+
+// Declares the minus op: inputs X, Y and output Out, each NotInGradient().
+class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left tensor of minus operator.").NotInGradient();
+    AddInput("Y", "The right tensor of minus operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of minus operator.").NotInGradient();
+    AddComment(R"DOC(Minus Operator
+
+Equation: Out = X - Y
+)DOC");
+  }
+};
+// Gradient of minus as a NetOp: identity for X@GRAD, scale by -1 for Y@GRAD.
+template <typename AttrType>
+class MinusGradOp : public NetOp {
+ public:
+  MinusGradOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    auto out_grad = Input(framework::GradVarName("Out"));
+    auto x_grad = Output(framework::GradVarName("X"));
+    auto y_grad = Output(framework::GradVarName("Y"));
+    // x_grad = out_grad
+    AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
+                                             {{"Out", {x_grad}}}, {}));
+    // y_grad = -1 * out_grad; AttrType is the type stored in "scale".
+    framework::AttributeMap scale_attr;
+    scale_attr["scale"] = static_cast<AttrType>(-1);
+    AppendOp(framework::OpRegistry::CreateOp("scale", {{"X", {out_grad}}},
+                                             {{"Out", {y_grad}}}, scale_attr));
+    CompleteAddOp(false);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_OP(scale);
+USE_OP_ITSELF(identity);
+namespace ops = paddle::operators;
+REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
+            ops::MinusGradOp<float>);
+REGISTER_OP_CPU_KERNEL(minus,
+                       ops::MinusKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu
new file mode 100644
index 0000000000..a8375cc630
--- /dev/null
+++ b/paddle/operators/minus_op.cu
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/minus_op.h"
+
+REGISTER_OP_GPU_KERNEL(
+    minus, paddle::operators::MinusKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h
new file mode 100644
index 0000000000..6310a4fd51
--- /dev/null
+++ b/paddle/operators/minus_op.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Element-wise kernel of minus: flattens X, Y and Out and writes X - Y to Out.
+template <typename Place, typename T>
+class MinusKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using EigenVec = framework::EigenVector<T>;
+    const auto* x = context.Input<framework::Tensor>("X");
+    const auto* y = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    // Allocate Out's buffer at the execution place before it is written.
+    out->mutable_data<T>(context.GetPlace());
+    EigenVec::Flatten(*out).device(context.GetEigenDevice<Place>()) =
+        EigenVec::Flatten(*x) - EigenVec::Flatten(*y);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 95d19fb6aa..173cc3850c 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -13,11 +13,12 @@
    limitations under the License. */
 
 #include "paddle/operators/mul_op.h"
-#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
+using framework::Tensor;
+
 class MulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -59,10 +60,23 @@ class MulOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {}
-  std::string DebugString() const override {
-    LOG(INFO) << "MulGrad";
-    return "";
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    PADDLE_ENFORCE(x_dims[0] == out_dims[0],
+                   "Out@GRAD M X N must equal to X dims 0, M ");
+    PADDLE_ENFORCE(y_dims[1] == out_dims[1],
+                   "Out@GRAD M X N must equal to Y dims 1, N ");
+
+    x_grad->Resize(x_dims);
+    y_grad->Resize(y_dims);
   }
 };
 
@@ -72,3 +86,5 @@ class MulOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
 REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+                       ops::MulGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
index 346a7e505d..a81444dbe6 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -17,3 +17,5 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mul_grad,
+                       ops::MulGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index b7812fd1a7..8facc02814 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -31,18 +31,34 @@ template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
-        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Y");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-    auto X = EigenMatrix<T>::From(*input0);
-    auto Y = EigenMatrix<T>::From(*input1);
-    auto Z = EigenMatrix<T>::From(*output);
-    auto& place = context.GetEigenDevice<Place>();
-
-    Z.device(place) = X.contract(Y, dim_pair);
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+};
+
+template <typename Place, typename T>
+class MulGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dY = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    dX->mutable_data<T>(ctx.GetPlace());
+    dY->mutable_data<T>(ctx.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(ctx.device_context_);
+    // dX = dOut * Y'. dX: M x K, dOut : M x N, Y : K x N
+    math::matmul<Place, T>(*dOut, false, *Y, true, 1, dX, 0, device_context);
+    // dY = X' * dOut. dY: K x N, dOut : M x N, X : M x K
+    math::matmul<Place, T>(*X, true, *dOut, false, 1, dY, 0, device_context);
   }
 };
 
diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc
index c36fe8d6b5..44d925f0b0 100644
--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
@@ -68,10 +68,15 @@ std::string NetOp::DebugString() const {
 bool NetOp::IsNetOp() const { return true; }
 
 std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
+  std::vector<std::string> all;
+  for (auto& pair : this->outputs_) {
+    for (auto& var_name : pair.second) {
+      all.push_back(var_name);
+    }
+  }
   if (has_intermediate) {
-    return this->outputs_.at(kAll);
+    return all;
   }
-  auto& all = this->outputs_.at(kAll);
   std::vector<std::string> ret_val;
   for (auto& each : all) {
     if (!Contains(intermediate_outputs_, each)) {
@@ -81,11 +86,17 @@ std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
   return ret_val;
 }
 
-NetOp::NetOp(const std::string& type,
-             const framework::OperatorBase::VarNameMap& inputs,
-             const framework::OperatorBase::VarNameMap& outputs,
+NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
+             const framework::VariableNameMap& outputs,
              const framework::AttributeMap& attrs)
-    : OperatorBase(type, inputs, outputs, attrs) {}
+    : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
+  PADDLE_ENFORCE(
+      add_op_done_,
+      "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
+  return std::unique_ptr<OperatorBase>(new NetOp(*this));
+}
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h
index 4a3408c158..fcd8134b2c 100644
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -38,8 +38,20 @@ class NetOp : public framework::OperatorBase {
  public:
   static const char kAll[];
   NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {}
-  NetOp(const std::string& type, const VarNameMap& inputs,
-        const VarNameMap& outputs, const framework::AttributeMap& attrs);
+
+  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
+        const framework::VariableNameMap& outputs,
+        const framework::AttributeMap& attrs);
+
+  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
+    this->ops_.reserve(o.ops_.size());
+    std::transform(
+        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
+        [](const std::unique_ptr<framework::OperatorBase>& op) {
+          return std::unique_ptr<framework::OperatorBase>(op->Clone());
+        });
+    this->CompleteAddOp();
+  }
 
   /**
    * Infer all the operators' input and output variables' shapes, will be called
@@ -74,21 +86,28 @@ class NetOp : public framework::OperatorBase {
     return true;
   }
 
+  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
+
   /**
    * @brief Add an operator by ptr
    */
-  void AddOp(const std::shared_ptr<OperatorBase>& op) {
-    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot AppendOp when this network is sealed");
     PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    ops_.push_back(op);
+    ops_.push_back(std::move(op));
   }
 
-  void InsertOp(size_t pos, const std::shared_ptr<OperatorBase>& op) {
+  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
     PADDLE_ENFORCE(!add_op_done_,
                    "Cannot InsertOp when this network is sealed");
     PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
     PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
-    ops_.insert(ops_.begin() + pos, op);
+    ops_.insert(ops_.begin() + pos, std::move(op));
+  }
+
+  void InsertOp(size_t pos, const framework::OperatorBase& op) {
+    InsertOp(pos, op.Clone());
   }
 
   void CompleteAddOp(bool calculate = true);
@@ -98,7 +117,9 @@ class NetOp : public framework::OperatorBase {
   bool IsNetOp() const override;
   std::vector<std::string> OutputVars(bool has_intermediate) const override;
 
-  std::vector<std::shared_ptr<OperatorBase>> ops_;
+  std::unique_ptr<framework::OperatorBase> Clone() const override;
+
+  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
 
  private:
   bool add_op_done_{false};
diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc
index 0cef71de6a..f2e98ee7a1 100644
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -13,6 +13,7 @@ static int run_cnt = 0;
 class TestOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
+  DEFINE_OP_CLONE_METHOD(TestOp);
   void InferShape(const Scope& scope) const override { ++infer_shape_cnt; }
   void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
@@ -37,15 +38,12 @@ TEST(OpKernel, all) {
   auto net = std::make_shared<NetOp>();
   ASSERT_NE(net, nullptr);
 
-  auto op1 = std::shared_ptr<TestOp>(
+  net->AppendOp(std::unique_ptr<TestOp>(
       new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                 {{"Out", {"y"}}}, {}));
-  net->AddOp(op1);
-
-  auto op2 = std::shared_ptr<TestOp>(
+                 {{"Out", {"y"}}}, {})));
+  net->AppendOp(std::unique_ptr<TestOp>(
       new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-                 {{"Out", {"z"}}}, {}));
-  net->AddOp(op2);
+                 {{"Out", {"z"}}}, {})));
 
   net->CompleteAddOp();
   AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
@@ -60,15 +58,31 @@ TEST(OpKernel, all) {
 
 TEST(NetOp, insert_op) {
   NetOp net;
-  auto op1 = std::shared_ptr<framework::NOP>(
+  auto op1 = std::unique_ptr<framework::NOP>(
       new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
                          {{"Out", {"y"}}}, {}));
-  net.AddOp(op1);
-  net.InsertOp(0, op1);
+  net.AppendOp(*op1);
+  net.InsertOp(0, *op1);
   ASSERT_EQ(2UL, net.ops_.size());
-  net.InsertOp(2, op1);
+  net.InsertOp(2, std::move(op1));
   ASSERT_EQ(3UL, net.ops_.size());
 }
 
+TEST(NetOp, Clone) {
+  NetOp net;
+  net.AppendOp(
+      std::unique_ptr<framework::NOP>(new framework::NOP{"empty", {}, {}, {}}));
+  net.AppendOp(std::unique_ptr<framework::NOP>(
+      new framework::NOP{"empty2", {}, {}, {}}));
+  net.CompleteAddOp(true);
+  auto new_net_op = net.Clone();
+  ASSERT_NE(new_net_op, nullptr);
+  ASSERT_TRUE(new_net_op->IsNetOp());
+  auto* new_net = static_cast<NetOp*>(new_net_op.get());
+  ASSERT_EQ(2UL, new_net->ops_.size());
+  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
+  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 78ce0ba3c0..16bd249cb3 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -131,8 +131,8 @@ const rnn::ArgumentName RecurrentGradientOp::kArgName{
     "memories",    "pre_memories", "boot_memories@grad"};
 
 RecurrentOp::RecurrentOp(const std::string& type,
-                         const framework::OperatorBase::VarNameMap& inputs,
-                         const framework::OperatorBase::VarNameMap& outputs,
+                         const framework::VariableNameMap& inputs,
+                         const framework::VariableNameMap& outputs,
                          const framework::AttributeMap& attrs)
     : OperatorBase(type, inputs, outputs, attrs) {
   rnn::InitArgument(kArgName, &arg_, *this);
@@ -223,8 +223,8 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
 }
 
 RecurrentGradientOp::RecurrentGradientOp(
-    const std::string& type, const framework::OperatorBase::VarNameMap& inputs,
-    const framework::OperatorBase::VarNameMap& outputs,
+    const std::string& type, const framework::VariableNameMap& inputs,
+    const framework::VariableNameMap& outputs,
     const framework::AttributeMap& attrs)
     : OperatorBase(type, inputs, outputs, attrs) {
   rnn::InitArgument(kArgName, &arg_, *this);
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index 171a0bd2ae..1033d657a3 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -34,7 +34,8 @@ class RecurrentAlgorithm {
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const;
 
-  void Init(rnn::Argument* arg, std::shared_ptr<NetOp>* stepnet) {
+  void Init(rnn::Argument* arg,
+            std::unique_ptr<framework::OperatorBase>* stepnet) {
     PADDLE_ENFORCE_NOT_NULL(stepnet, "stepnet should be set before.");
     arg_ = arg;
     stepnet_ = stepnet;
@@ -63,7 +64,7 @@ class RecurrentAlgorithm {
   void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const;
 
  private:
-  std::shared_ptr<NetOp>* stepnet_;
+  std::unique_ptr<framework::OperatorBase>* stepnet_;
   rnn::Argument* arg_;
   mutable size_t seq_len_;
 };
@@ -80,7 +81,8 @@ class RecurrentGradientAlgorithm {
    * operator.
    */
  public:
-  void Init(rnn::Argument* arg, std::shared_ptr<NetOp>* stepnet) {
+  void Init(rnn::Argument* arg,
+            std::unique_ptr<framework::OperatorBase>* stepnet) {
     PADDLE_ENFORCE_NOT_NULL(stepnet, "stepnet should be set before.");
     arg_ = std::move(arg);
     stepnet_ = stepnet;
@@ -107,16 +109,24 @@ class RecurrentGradientAlgorithm {
  private:
   rnn::Argument* arg_;
   mutable size_t seq_len_;
-  std::shared_ptr<NetOp>* stepnet_;
+  std::unique_ptr<framework::OperatorBase>* stepnet_;
 };
 
-class RecurrentOp final : public framework::OperatorBase {
+class RecurrentOp : public framework::OperatorBase {
  public:
-  RecurrentOp(const std::string& type, const VarNameMap& inputs,
-              const VarNameMap& outputs, const framework::AttributeMap& attrs);
+  RecurrentOp(const std::string& type, const framework::VariableNameMap& inputs,
+              const framework::VariableNameMap& outputs,
+              const framework::AttributeMap& attrs);
+
+  RecurrentOp(const RecurrentOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    // TODO(yuyang18): Implement copy ctor well.
+    PADDLE_THROW("Not implemented");
+  }
   /**
-     * InferShape must be called before Run.
-     */
+   * InferShape must be called before Run.
+   */
   void InferShape(const framework::Scope& scope) const override {
     alg_.InferShape(scope);
   }
@@ -126,23 +136,33 @@ class RecurrentOp final : public framework::OperatorBase {
     alg_.Run(scope, dev_ctx);
   }
 
-  void set_stepnet(std::shared_ptr<NetOp> net) { stepnet_ = net; }
-  const NetOp& stepnet() const { return *stepnet_; }
+  void set_stepnet(std::unique_ptr<OperatorBase> net) {
+    stepnet_ = std::move(net);
+  }
+  const OperatorBase& stepnet() const { return *stepnet_; }
 
   static const rnn::ArgumentName kArgName;
 
  private:
   RecurrentAlgorithm alg_;
   rnn::Argument arg_;
-  std::shared_ptr<NetOp> stepnet_;
+  std::unique_ptr<OperatorBase> stepnet_;
 };
 
-class RecurrentGradientOp final : public framework::OperatorBase {
+class RecurrentGradientOp : public framework::OperatorBase {
  public:
-  RecurrentGradientOp(const std::string& type, const VarNameMap& inputs,
-                      const VarNameMap& outputs,
+  RecurrentGradientOp(const std::string& type,
+                      const framework::VariableNameMap& inputs,
+                      const framework::VariableNameMap& outputs,
                       const framework::AttributeMap& attrs);
 
+  RecurrentGradientOp(const RecurrentGradientOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    // TODO(yuyang18): Implement Copy ctor.
+    PADDLE_THROW("Not Implemented");
+  }
+
   /**
    * InferShape must be called before Run.
    */
@@ -157,12 +177,14 @@ class RecurrentGradientOp final : public framework::OperatorBase {
 
   static const rnn::ArgumentName kArgName;
 
-  void set_stepnet(const std::shared_ptr<NetOp>& net) { stepnet_ = net; }
-  const NetOp& stepnet() const { return *stepnet_; }
+  void set_stepnet(std::unique_ptr<OperatorBase> net) {
+    stepnet_ = std::move(net);
+  }
+  const OperatorBase& stepnet() const { return *stepnet_; }
 
  private:
   RecurrentGradientAlgorithm alg_;
-  std::shared_ptr<NetOp> stepnet_;
+  std::unique_ptr<OperatorBase> stepnet_;
   rnn::Argument arg_;
 };
 
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 8375d98804..6825dce332 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -17,7 +17,9 @@
 namespace paddle {
 namespace operators {
 
-class RowWiseAddOp : public framework::OperatorWithKernel {
+using framework::Tensor;
+
+class RowwiseAddOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -34,9 +36,9 @@ class RowWiseAddOp : public framework::OperatorWithKernel {
   }
 };
 
-class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RowWiseAddOpMaker(framework::OpProto *proto,
+  RowwiseAddOpMaker(framework::OpProto *proto,
                     framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
@@ -49,12 +51,32 @@ for i in xrange(X.shape[0]):
 )DOC");
   }
 };
+class RowwiseAddGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "X should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"), "b should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+    auto dims0 = ctx.Input<Tensor>("X")->dims();  // dX takes X's shape
+    auto dims1 = ctx.Input<Tensor>("b")->dims();  // db takes b's shape
+    PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1");
+    ctx.Output<Tensor>(framework::GradVarName("X"))->Resize(dims0);
+    ctx.Output<Tensor>(framework::GradVarName("b"))->Resize(dims1);
+  }
+};
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(rowwise_add, ops::RowWiseAddOp,
-                             ops::RowWiseAddOpMaker);
+REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker,
+            rowwise_add_grad, ops::RowwiseAddGradOp);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add, ops::RowwiseAddKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    rowwise_add, ops::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
+    rowwise_add_grad,
+    ops::RowwiseAddGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index 86f80b8122..4a57f64c89 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -17,4 +17,7 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    rowwise_add, ops::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
+    rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add_grad,
+    ops::RowwiseAddGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 01f88f2198..1cbd8bb31a 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class RowWiseAddKernel : public framework::OpKernel {
+class RowwiseAddKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto out = context.Output<Tensor>("Out");
@@ -47,5 +47,25 @@ class RowWiseAddKernel : public framework::OpKernel {
   }
 };
 
+template <typename Place, typename T>
+class RowwiseAddGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* db = context.Output<Tensor>(framework::GradVarName("b"));
+    dX->mutable_data<T>(context.GetPlace());
+    db->mutable_data<T>(context.GetPlace());
+
+    auto OutGrad = EigenMatrix<T>::From(*dOut);
+    auto place = context.GetEigenDevice<Place>();
+    EigenMatrix<T>::From(*dX).device(place) = OutGrad;
+
+    // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
+    // colwise add
+    Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
+    EigenVector<T>::Flatten(*db).device(place) = OutGrad.sum(dims);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
new file mode 100644
index 0000000000..8e96a74c94
--- /dev/null
+++ b/paddle/operators/scale_op.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/scale_op.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ScaleOp : public framework::OperatorWithKernel {
+ public:
+  ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *in = ctx.Input<framework::Tensor>("X");
+    auto *out = ctx.Output<framework::Tensor>("Out");
+    out->Resize(in->dims());
+  }
+};
+
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+
+// The gradient of the scale op is itself a scale op with the same factor:
+// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
+template <typename AttrType>
+class ScaleGradOp : public NetOp {
+ public:
+  ScaleGradOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    AppendOp(framework::OpRegistry::CreateOp(
+        "scale", {{"X", {Input(framework::GradVarName("Out"))}}},
+        {{"Out", {Output(framework::GradVarName("X"))}}},
+        {{"scale", GetAttr<AttrType>("scale")}}));
+    CompleteAddOp(false);
+  }
+};
+
+// identity is an alias of scale op. This is also an example of creating an
+// alias operator.
+template <typename AttrType>
+class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IdentityOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "input tensor of identity op");
+    AddOutput("Out", "output tensor of identity op");
+    AddComment("identity operator. Just a alias of scale op which scale = 1.0");
+  }
+};
+
+template <typename AttrType>
+class IdentityOp : public NetOp {
+ public:
+  IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    AppendOp(framework::OpRegistry::CreateOp(
+        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
+        {{"scale", static_cast<AttrType>(1)}}));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
+            ops::ScaleGradOp<float>);
+REGISTER_OP_CPU_KERNEL(scale,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
+                             ops::IdentityOpMaker<float>);
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
new file mode 100644
index 0000000000..63efbe0da8
--- /dev/null
+++ b/paddle/operators/scale_op.cu
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/scale_op.h"
+
+REGISTER_OP_GPU_KERNEL(
+    scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h
new file mode 100644
index 0000000000..aea64f1b04
--- /dev/null
+++ b/paddle/operators/scale_op.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename Place, typename T, typename AttrType = T>
+class ScaleKernel : public framework::OpKernel {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    tensor->mutable_data<T>(in->place());
+
+    auto scale = static_cast<T>(context.op_.GetAttr<AttrType>("scale"));
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& dev = context.GetEigenDevice<Place>();
+    eigen_out.device(dev) = scale * eigen_in;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
new file mode 100644
index 0000000000..f901edefa2
--- /dev/null
+++ b/paddle/operators/scatter_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/scatter_op.h"
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+class ScatterOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Index")->dims().size(), 1,
+                      "Update Index should be 1-D.");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Ref")->dims().size(),
+                      ctx.Input<Tensor>("Updates")->dims().size(),
+                      "Reference and Updates should have the same shape size");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Updates")->dims()[0],
+                      ctx.Input<Tensor>("Index")->dims()[0],
+                      "Updates and Index should have same batch-size.");
+    framework::DDim data_dim(ctx.Input<Tensor>("Updates")->dims());
+    for (int i = 1; i < data_dim.size(); ++i)  // trailing dims must match Ref
+      PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input<Tensor>("Ref")->dims()[i]);
+    ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("Ref")->dims());
+  }
+};
+
+class ScatterGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *Ref = ctx.Input<Tensor>("Ref");
+
+    dRef->Resize(Ref->dims());
+    dUpdates->Resize(Updates->dims());
+  }
+};
+
+class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScatterOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ref", "The source input of scatter op");
+    AddInput("Index",
+             "The index input of scatter op where Ref will be updated");
+    AddInput("Updates", "The update values of scatter op, added at Index");
+    AddOutput("Out", "The output of scatter op");
+    AddComment(R"DOC(
+Scatter Operator by selecting from the first axis.
+
+Out = Ref
+Out[Index] = Ref[Index] + Updates
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
+            ops::ScatterGradOp);
+REGISTER_OP_CPU_KERNEL(scatter,
+                       ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    scatter_grad,
+    ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
new file mode 100644
index 0000000000..6716b47883
--- /dev/null
+++ b/paddle/operators/scatter_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/scatter_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(scatter,
+                       ops::ScatterOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
new file mode 100644
index 0000000000..e9595638a8
--- /dev/null
+++ b/paddle/operators/scatter_op.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class ScatterOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *Out = ctx.Output<Tensor>("Out");
+
+    // In place output: Out = Ref, Out[Index] += Updates
+    Out->ShareDataWith<T>(*Ref);
+    // Apply ScatterUpdate: Out[index] += Updates[:]
+    ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out);
+  }
+};
+
+template <typename Place, typename T>
+class ScatterGradientOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // In place gradient: dRef = dO
+    dRef->ShareDataWith<T>(*dOut);
+    dUpdates->mutable_data<T>(ctx.GetPlace());
+    // Gradient by Gather: dUpdates += dO[Index]
+    Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc
index 4449ce6564..26fdaff146 100644
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
@@ -49,4 +49,8 @@ TEST(scatter, ScatterUpdate) {
     EXPECT_EQ(output->data<float>()[i], float(i - 4));
   for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0));
   for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
+
+  delete src;
+  delete index;
+  delete output;
 }
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index bfb449d0b0..a0b5000ffb 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -30,7 +30,7 @@ class SGDOpKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto param = ctx.Input<Tensor>("param");
     auto grad = ctx.Input<Tensor>("grad");
-    auto param_out = ctx.Output<Tensor>(0);
+    auto param_out = ctx.Output<Tensor>("param_out");
     float lr = ctx.op_.GetAttr<float>("learning_rate");
 
     param_out->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
index d773a4f2d5..761c6de8d4 100644
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -44,7 +44,8 @@ class SigmoidOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+    ctx.Output<Tensor>(framework::GradVarName("X"))
+        ->Resize(ctx.Input<Tensor>("Y")->dims());
   }
 };
 
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
index 11ab923eb3..b01a9b3f23 100644
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -37,7 +37,7 @@ class SigmoidKernel : public framework::OpKernel {
     auto Y = EigenVector<T>::Flatten(*output);
     auto place = context.GetEigenDevice<Place>();
 
-    Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp());
+    Y.device(place) = 1. / (1. + (-X).exp());
   }
 };
 
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index a0a0d4d914..29491137e6 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
-
    http://www.apache.org/licenses/LICENSE-2.0
-
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -39,7 +36,8 @@ class CPUUniformRandomKernel : public framework::OpKernel {
     std::uniform_real_distribution<T> dist(
         static_cast<T>(context.op_.GetAttr<float>("min")),
         static_cast<T>(context.op_.GetAttr<float>("max")));
-    for (ssize_t i = 0; i < framework::product(tensor->dims()); ++i) {
+    ssize_t size = framework::product(tensor->dims());
+    for (ssize_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
   }
@@ -66,7 +64,6 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Out", "The output tensor of uniform random op");
     AddComment(R"DOC(Uniform random operator.
-
 Used to initialize tensor with uniform random generator.
 )DOC");
     AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index 7a243555b6..1d6709934c 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
-
    http://www.apache.org/licenses/LICENSE-2.0
-
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 0547ac93cd..2b945de18a 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -276,17 +276,21 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
 void Argument::concat(const std::vector<Argument>& args,
                       const std::vector<int>& selectRows,
                       const std::vector<int>& seqStartPos,
+                      const std::vector<int>& copySize,
                       bool useGpu,
                       hl_stream_t stream,
                       PassType passType) {
   CHECK(!subSequenceStartPositions)
       << "undefined behavior for subsequence positions";
 
-  size_t batchSize = selectRows.size();
+  size_t batchSize = 0;
+  for (size_t i = 0; i < copySize.size(); ++i)
+    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
+
   auto copyArg = [batchSize, stream](MatrixPtr& dst,
                                      MatrixPtr src,
-                                     int startRow,
-                                     int pos,
+                                     int desStartRow,
+                                     int srcStartRow,
                                      int size,
                                      bool useGpu) {
     if (!src) {
@@ -300,14 +304,14 @@ void Argument::concat(const std::vector<Argument>& args,
       dst->resize(batchSize, width);
     }
 
-    MatrixPtr tmpMatrix = dst->subMatrix(startRow, size);
-    tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream);
+    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
+    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
   };
 
   auto copyIds = [batchSize, stream](IVectorPtr& dst,
                                      const IVectorPtr& src,
-                                     int startRow,
-                                     int pos,
+                                     int desStartRow,
+                                     int srcStartRow,
                                      int size,
                                      bool useGpu) {
     if (!src) {
@@ -315,13 +319,14 @@ void Argument::concat(const std::vector<Argument>& args,
       return;
     }
     IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream);
+    dst->subVec(desStartRow, size)
+        ->copyFrom(*src->subVec(srcStartRow, size), stream);
   };
 
   auto copyStrs = [batchSize, stream](SVectorPtr& dst,
                                       const SVectorPtr& src,
-                                      int startRow,
-                                      int pos,
+                                      int desStartRow,
+                                      int srcStartRow,
                                       int size,
                                       bool useGpu) {
     if (!src) {
@@ -333,30 +338,31 @@ void Argument::concat(const std::vector<Argument>& args,
     } else {
       dst->resize(batchSize);
     }
-    std::copy(
-        src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow);
+    std::copy(src->begin() + srcStartRow,
+              src->begin() + srcStartRow + size,
+              dst->begin() + desStartRow);
   };
 
   dataId = args[0].dataId;
   CHECK_NE(seqStartPos.size(), 0UL);
-  size_t sampleNum = seqStartPos.size() - 1;
-  for (size_t i = 0; i < sampleNum; ++i) {
+  int desStartRow = 0;
+  for (size_t i = 0; i < copySize.size(); ++i) {
     int startPos = seqStartPos[i];
     int endPos = seqStartPos[i + 1];
     CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
     for (int j = startPos; j < endPos; ++j) {
       const Argument& arg = args[j - startPos];
-      CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
-                                   << " same dataId";
-      const int copySize = 1;
-      const int rowIdx = selectRows[j];
-      copyArg(in, arg.in, j, rowIdx, copySize, useGpu);
-      copyArg(value, arg.value, j, rowIdx, copySize, useGpu);
+      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
+                                   << "the same dataId.";
+      const int srcStartRow = selectRows[j];
+      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
       if (passType != PASS_TEST) {
-        copyArg(grad, arg.grad, j, rowIdx, copySize, useGpu);
+        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
       }
-      copyIds(ids, arg.ids, j, rowIdx, copySize, useGpu);
-      copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu);
+      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
+      desStartRow += copySize[i];
     }
   }
   ICpuGpuVector::resizeOrCreate(
@@ -670,19 +676,28 @@ void Argument::reorganizeSeqInfo(
     const ICpuGpuVectorPtr seqStartPos,
     const ICpuGpuVectorPtr subSeqStartPos,
     std::vector<std::vector<int>>& reorganizedSeqInfo) {
-  int* seqStarts = seqStartPos->getMutableData(false);
-  int* subSeqStarts = subSeqStartPos->getMutableData(false);
+  CHECK(seqStartPos);
 
   int seqNum = seqStartPos->getSize() - 1;
-  reorganizedSeqInfo.resize(seqNum, std::vector<int>());
-  int seqIdx = 0;
-  for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
-    reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
-    if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
-      seqIdx++;
-      if (seqIdx == seqNum) return;
+  int* seqStarts = seqStartPos->getMutableData(false);
+
+  if (subSeqStartPos) {
+    int* subSeqStarts = subSeqStartPos->getMutableData(false);
+    reorganizedSeqInfo.resize(seqNum, std::vector<int>());
+    int seqIdx = 0;
+    for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
       reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+        seqIdx++;
+        if (seqIdx == seqNum) return;
+        reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      }
     }
+  } else {
+    reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
+    memcpy(reorganizedSeqInfo[0].data(),
+           seqStarts,
+           sizeof(int) * seqStartPos->getSize());
   }
 }
 
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index d8d7a4398f..38797a76f5 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -240,6 +240,7 @@ struct Argument {
   void concat(const std::vector<Argument>& args,
               const std::vector<int>& selectRows,
               const std::vector<int>& seqStartPos,
+              const std::vector<int>& copySize,
               bool useGpu,
               hl_stream_t stream,
               PassType passType);
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index ebe36d4937..f031109501 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -48,7 +48,8 @@ Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit)
       deviceId_(-1),
       sharedCount_(0),
       updateCounter_(0),
-      updated_(false) {
+      updated_(false),
+      headerFormat_(PARAM_FORMAT_ORIGINAL) {
   setID(-1); /* capture uninitialized id */
   if (useGpu_ && FLAGS_parallel_nn) {
     /* gpu environment is specified by device property */
@@ -285,7 +286,7 @@ bool Parameter::save(const std::string& filename) const {
 bool Parameter::save(std::ostream& s) const {
   CpuVector vec(*bufs_[PARAMETER_VALUE].get());
   Header header;
-  header.version = kFormatVersion;
+  header.format = headerFormat_;
   header.valueSize = sizeof(real);
   header.size = getSize();
 
@@ -344,8 +345,9 @@ bool Parameter::load(std::istream& s) {
   Header header;
   CHECK(s.read(reinterpret_cast<char*>(&header), sizeof(header)))
       << "Fail to read parameter " << getName();
-  CHECK_EQ(header.version, kFormatVersion) << "Incorrect format version: "
-                                           << header.version;
+  CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: "
+                                                << header.format;
+  headerFormat_ = header.format;
   CHECK_EQ(header.size, getSize())
       << "The size (" << header.size << ") in the file does not match the size "
       << "(" << getSize() << ") of the parameter: " << getName();
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 0bac76f068..321f4275d8 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -34,6 +34,20 @@ limitations under the License. */
 
 namespace paddle {
 
+typedef enum {
+  /// The paddle original basic format
+  PARAM_FORMAT_ORIGINAL = 0,
+
+  /// See mkldnn_memory_format_t in
+  /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h
+  /// for a detailed description.
+  /// 2D weights tensor in the format (output channels, input channels).
+  PARAM_FORMAT_MKLDNN_OI,
+
+  /// The total format items numbers
+  PARAM_FORMAT_ITEMS,
+} PARAM_FORMAT;
+
 class SparsePrefetchRowCpuMatrix;
 
 class Parameter;
@@ -51,7 +65,10 @@ public:
   size_t getSize() const { return config_.size(); }
 
   bool isFullSize() const {
-    return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
+    if (bufs_[PARAMETER_VALUE]) {
+      return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
+    }
+    return false;
   }
 
   inline bool useGpu() const { return useGpu_; }
@@ -242,14 +259,30 @@ public:
   /// Initialize the value to 0
   void zeroMem();
 
-  static const int kFormatVersion = 0;
   /// file header structure
   struct Header {
-    int32_t version;     // = 0, file format version
+    int32_t format;      // = PARAM_FORMAT
     uint32_t valueSize;  // = sizeof(real)
     uint64_t size;       // = getSize()
   };
 
+  /**
+   * @brief Whether fmt is a valid PARAM_FORMAT (negative values rejected).
+   */
+  static bool isHeaderFormatSupported(int32_t fmt) {
+    return fmt >= 0 && fmt < PARAM_FORMAT_ITEMS;
+  }
+
+  /**
+   * @brief Get the header format used when saving or loading this param.
+   */
+  int getHeaderFormat() const { return headerFormat_; }
+
+  /**
+   * @brief Set the format in header.
+   */
+  void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; }
+
   /**
    * @brief  Parameter Update Hook.
    *
@@ -321,6 +354,9 @@ protected:
   bool updated_;
   SparseFormat format_;
 
+  /// The header format for saving or loading param
+  int32_t headerFormat_;
+
   std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
 
 public:
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 4154aad15c..120eb1e4af 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -9,6 +9,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 add_subdirectory(dynload)
 
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece)
+cc_test(environment_test SRCS environment_test.cc DEPS stringpiece)
 
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
@@ -16,5 +17,8 @@ ELSE()
     set(GPU_CTX_DEPS)
 ENDIF()
 
-cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS})
+# memcpy depends on device_context, so the deps are listed individually
+# here to avoid a cyclic dependency
+cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
+    system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
diff --git a/paddle/platform/cuda_helper.h b/paddle/platform/cuda_helper.h
new file mode 100644
index 0000000000..6feec0d7f8
--- /dev/null
+++ b/paddle/platform/cuda_helper.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+#define CUDA_ATOMIC_WRAPPER(op, T) \
+  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
+
+#define USE_CUDA_ATOMIC(op, T) \
+  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
+
+// For atomicAdd.
+USE_CUDA_ATOMIC(Add, float);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+USE_CUDA_ATOMIC(Add, double);
+#else
+CUDA_ATOMIC_WRAPPER(Add, double) {
+  unsigned long long int* address_as_ull =
+      reinterpret_cast<unsigned long long int*>(address);
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+#endif
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index a928e09778..ad212c5b2c 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/platform/device_context.h"
+#include "paddle/memory/memory.h"
 
 namespace paddle {
 namespace platform {
@@ -36,6 +37,59 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 
 #ifndef PADDLE_ONLY_CPU
 
+class EigenCudaStreamDevice : public Eigen::StreamInterface {
+ public:
+  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
+    Eigen::initializeDeviceProp();
+  }
+  ~EigenCudaStreamDevice() override {}
+
+  void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) {
+    stream_ = cuda_stream;
+    place_ = place;
+    device_prop_ = &Eigen::m_deviceProperties[place.device];
+  }
+
+  const cudaStream_t& stream() const override { return *stream_; }
+
+  const cudaDeviceProp& deviceProperties() const override {
+    return *device_prop_;
+  }
+
+  void* allocate(size_t num_bytes) const override {
+    return paddle::memory::Alloc(place_, num_bytes);
+  }
+
+  void deallocate(void* buffer) const override {
+    paddle::memory::Free(place_, buffer);
+  }
+
+  void* scratchpad() const override {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  unsigned int* semaphore() const override {
+    if (semaphore_ == NULL) {
+      char* scratch =
+          static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      PADDLE_ENFORCE(
+          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+    }
+    return semaphore_;
+  }
+
+ private:
+  GPUPlace place_;
+  const cudaStream_t* stream_;         // not owned;
+  const cudaDeviceProp* device_prop_;  // not owned;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
 template <>
 Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
   return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
@@ -43,19 +97,9 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
 
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   SetDeviceId(place_.device);
-  // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
-  // here will cause segment fault. We must implement a class derived from
-  // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
-  // later. Please refer to the implementation of class EigenCudaStreamDevice
-  // in TensorFlow.
-  //
-  // We find that CUDA 7 introduces a new option, the per-thread default stream,
-  // that has two effects. Please refer to https://devblogs.nvidia.com/
-  // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
-  //
-  // So, we decide to use default stream and add –default-stream per-thread nvcc
-  // flag. Than, two threads with two CUDADeviceContexts will run parallelly.
-  eigen_stream_.reset(new Eigen::CudaStreamDevice());
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  eigen_stream_.reset(new EigenCudaStreamDevice());
+  eigen_stream_->Reinitialize(&stream_, place);
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
 }
 
@@ -70,17 +114,15 @@ CUDADeviceContext::~CUDADeviceContext() {
     PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
   }
 
-  if (curand_generator_) {
-    PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
-  }
   eigen_stream_.reset();
   eigen_device_.reset();
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
 }
 
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
 }
 
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
@@ -91,6 +133,7 @@ cublasHandle_t CUDADeviceContext::cublas_handle() {
   if (!cublas_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+    PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
   }
   return cublas_handle_;
 }
@@ -99,20 +142,12 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
   if (!cudnn_handle_) {
     SetDeviceId(place_.device);
     PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
   }
   return cudnn_handle_;
 }
 
-curandGenerator_t CUDADeviceContext::curand_generator() {
-  if (!curand_generator_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
-                                                  CURAND_RNG_PSEUDO_DEFAULT));
-    PADDLE_ENFORCE(
-        dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
-  }
-  return curand_generator_;
-}
+cudaStream_t CUDADeviceContext::stream() { return stream_; }
 
 #endif  // PADDLE_ONLY_CPU
 
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 08b5b2cff9..11528e1194 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
-#include "paddle/platform/dynload/curand.h"
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
@@ -40,7 +39,7 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {
  public:
   CPUDeviceContext();
-  explicit CPUDeviceContext(CPUPlace);
+  explicit CPUDeviceContext(CPUPlace place);
   virtual ~CPUDeviceContext() {}
 
   Eigen::DefaultDevice* eigen_device() const;
@@ -52,10 +51,11 @@ class CPUDeviceContext : public DeviceContext {
 };
 
 #ifndef PADDLE_ONLY_CPU
+class EigenCudaStreamDevice;
 
 class CUDADeviceContext : public DeviceContext {
  public:
-  explicit CUDADeviceContext(GPUPlace);
+  explicit CUDADeviceContext(GPUPlace place);
   virtual ~CUDADeviceContext();
 
   /*! \brief  Wait for all operations completion in the stream. */
@@ -74,24 +74,20 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cudnn  handle in the device context. */
   cudnnHandle_t     cudnn_handle();
 
-  /*! \brief  Return curand handle in the device context. */
-  curandGenerator_t curand_generator();
+  /*! \brief  Return cuda stream in the device context. */
+  cudaStream_t      stream();
   // clang-format on
 
  private:
   GPUPlace place_;
 
- private:
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
-  std::unique_ptr<Eigen::CudaStreamDevice> eigen_stream_;
-
- private:
-  uint64_t seed_;
+  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 
   // clang-format off
-  cudnnHandle_t     cudnn_handle_     = nullptr;
-  cublasHandle_t    cublas_handle_    = nullptr;
-  curandGenerator_t curand_generator_ = nullptr;
+  cudaStream_t       stream_{nullptr};
+  cudnnHandle_t      cudnn_handle_{nullptr};
+  cublasHandle_t     cublas_handle_{nullptr};
   // clang-format on
 };
 
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index 65345c433c..5883a55272 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -43,8 +43,7 @@ TEST(Device, CUDADeviceContext) {
     ASSERT_NE(nullptr, cudnn_handle);
     cublasHandle_t cublas_handle = device_context->cublas_handle();
     ASSERT_NE(nullptr, cublas_handle);
-    curandGenerator_t curand_handle = device_context->curand_generator();
-    ASSERT_NE(nullptr, curand_handle);
+    ASSERT_NE(nullptr, device_context->stream());
     delete device_context;
   }
 }
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 15fdf7a94f..81448897e9 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -86,7 +86,7 @@ struct EnforceNotMet : public std::exception {
                                   2 + sizeof(void*) * 2, call_stack[i],
                                   demangled, addr_offset);
         } else {
-          sout << string::Sprintf("%-3d %*0p %s\n", i, 2 + sizeof(void*) * 2,
+          sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
                                   call_stack[i]);
         }
       }
diff --git a/paddle/platform/environment.h b/paddle/platform/environment.h
new file mode 100644
index 0000000000..4edcce932e
--- /dev/null
+++ b/paddle/platform/environment.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <vector>
+
+#include "paddle/platform/enforce.h"
+#include "paddle/string/piece.h"
+
+extern char** environ;  // for environment variables
+
+namespace paddle {
+namespace platform {
+
+inline void SetEnvVariable(const std::string& name, const std::string& value) {
+  PADDLE_ENFORCE_NE(setenv(name.c_str(), value.c_str(), 1), -1,
+                    "Failed to set environment variable %s=%s", name, value);
+}
+
+inline void UnsetEnvVariable(const std::string& name) {
+  PADDLE_ENFORCE_NE(unsetenv(name.c_str()), -1,
+                    "Failed to unset environment variable %s", name);
+}
+
+inline bool IsEnvVarDefined(const std::string& name) {
+  return std::getenv(name.c_str()) != nullptr;
+}
+
+inline std::string GetEnvValue(const std::string& name) {
+  PADDLE_ENFORCE(IsEnvVarDefined(name),
+                 "Tried to access undefined environment variable %s", name);
+  return std::getenv(name.c_str());
+}
+
+inline std::vector<std::string> GetAllEnvVariables() {
+  std::vector<std::string> vars;
+  for (auto var = environ; *var != nullptr; ++var) {
+    auto tail = string::Index(*var, "=");
+    auto name = string::SubStr(*var, 0, tail).ToString();
+    vars.push_back(name);
+  }
+  return vars;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/environment_test.cc b/paddle/platform/environment_test.cc
new file mode 100644
index 0000000000..5f13652721
--- /dev/null
+++ b/paddle/platform/environment_test.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/environment.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+TEST(ENVIRONMENT, ACCESS) {
+  namespace platform = paddle::platform;
+  namespace string = paddle::string;
+
+  platform::SetEnvVariable("PADDLE_USE_ENV", "TRUE");
+
+  EXPECT_TRUE(platform::IsEnvVarDefined("PADDLE_USE_ENV"));
+  EXPECT_EQ(platform::GetEnvValue("PADDLE_USE_ENV"), "TRUE");
+
+  platform::UnsetEnvVariable("PADDLE_USE_ENV");
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV"));
+
+  platform::SetEnvVariable("PADDLE_USE_ENV1", "Hello ");
+  platform::SetEnvVariable("PADDLE_USE_ENV2", "World, ");
+  platform::SetEnvVariable("PADDLE_USE_ENV3", "PaddlePaddle!");
+
+  std::string env_info;
+  auto vars = platform::GetAllEnvVariables();
+  for_each(vars.begin(), vars.end(), [&](const std::string& var) {
+    env_info += platform::GetEnvValue(var);
+  });
+
+  EXPECT_TRUE(string::Contains(env_info, "Hello World, PaddlePaddle!"));
+  platform::UnsetEnvVariable("PADDLE_USE_ENV1");
+  platform::UnsetEnvVariable("PADDLE_USE_ENV2");
+  platform::UnsetEnvVariable("PADDLE_USE_ENV3");
+
+  env_info.clear();
+  vars = platform::GetAllEnvVariables();
+  for_each(vars.begin(), vars.end(), [&](const std::string& var) {
+    env_info += platform::GetEnvValue(var);
+  });
+
+  EXPECT_FALSE(string::Contains(env_info, "Hello World, PaddlePaddle!"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV1"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV2"));
+  EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV3"));
+}
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index edeb3ecd7b..be381a4e26 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/platform/gpu_info.h"
+
 #include "gflags/gflags.h"
+
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/environment.h"
 
 DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
               "Default use 95% of GPU memory for PaddlePaddle,"
@@ -70,6 +73,13 @@ size_t GpuMaxChunkSize() {
 
   GpuMemoryUsage(available, total);
 
+  if (IsEnvVarDefined(kEnvFractionGpuMemoryToUse)) {
+    auto val = std::stod(GetEnvValue(kEnvFractionGpuMemoryToUse));
+    PADDLE_ENFORCE_GT(val, 0.0);
+    PADDLE_ENFORCE_LE(val, 1.0);
+    FLAGS_fraction_of_gpu_memory_to_use = val;
+  }
+
   // Reserving the rest memory for page tables, etc.
   size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
 
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index d3a5f5f13f..ed2420b874 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -18,10 +18,15 @@ limitations under the License. */
 
 #include <cuda_runtime.h>
 #include <stddef.h>
+#include <string>
 
 namespace paddle {
 namespace platform {
 
+//! Environment variable: fraction of GPU memory to use on each device.
+const std::string kEnvFractionGpuMemoryToUse =
+    "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
+
 //! Get the total number of GPU devices in system.
 int GetDeviceCount();
 
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
index f7e391f763..54063a809a 100644
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -65,7 +65,6 @@ void ParameterClient2::initThreads() {
     LOG(INFO) << "parallel_thread_num dosent need to set";
   }
   syncThreadPool_.reset(new SyncThreadPool(threadNum_));
-
   startThreads();
 }
 
@@ -224,6 +223,14 @@ void ParameterClient2::prepareSendData(
     request.set_cost(cost);
     request.set_batch_status(batchStatus);
     CHECK_EQ(request.blocks_size(), 0);
+    VLOG(10) << "request: trainer_id: " << request.trainer_id()
+             << " update_mode: " << request.update_mode()
+             << " send_back_parameter: " << request.send_back_parameter()
+             << " send_back_parameter_type: "
+             << request.send_back_parameter_type()
+             << " num_samples: " << request.num_samples()
+             << " cost: " << request.cost()
+             << " batch_status: " << request.batch_status();
   }
   for (const auto& segments : parameterSegments) {
     const auto it = parameterMap_.find(segments.id);
@@ -251,11 +258,17 @@ void ParameterClient2::prepareSendData(
       CHECK(sendMat != nullptr) << "sendMat is nullptr";
 
       syncThreadPool_->exec([&](int tid, size_t numThreads) {
+        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
         const auto& localIndices = prefetchMat->getLocalIndices();
         /// num of sparse rows
         size_t nLocalBlocks = localIndices.size();
         uint64_t beginDim = 0;
         uint64_t endDim = 0;
+
+        // FIXME(typhoonzero): let it resize first
+        prefetchMat->getLocalRow(nLocalBlocks + 1);
+        sendMat->getLocalRow(nLocalBlocks + 1);
+
         for (size_t row = 0; row < nLocalBlocks; ++row) {
           int64_t blockId = localIndices[row];  // local row -> sparse row
           int serverId = std::abs((blockId + nameHash) % serviceNum_);
@@ -275,7 +288,6 @@ void ParameterClient2::prepareSendData(
           block->set_begin_pos(row * blockSize);
           /// block len
           block->set_block_size(endDim - beginDim);
-
           if (sendingPara) {
             sendJob->parallelInputIovs[serverId].push_back(
                 {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index 89b3ddd502..29b9eeacdd 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -583,6 +583,7 @@ protected:
 #ifndef PADDLE_DISABLE_TIMER
   uint64_t forwardbackwordTime_;
 #endif
+  std::mutex sparseAutoGrowthMutex_;
 
   /// map id to parameter used for decoding protobuf data
   std::unordered_map<size_t, ParameterPtr> parameterMap_;
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index d7c1d4f788..54f5c4c0fb 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -1032,8 +1032,8 @@ void ParameterServer2::loadValueVector(const LoadValueRequest& request,
   Parameter::Header header;
   CHECK(fs.read(reinterpret_cast<char*>(&header), sizeof(header)))
       << "Fail to read parameters in pserver";
-  CHECK_EQ(header.version, Parameter::kFormatVersion)
-      << "Incorrect format version: " << header.version;
+  CHECK(Parameter::isHeaderFormatSupported(header.format))
+      << "Incorrect format version: " << header.format;
   CHECK_EQ(header.size, (size_t)size_)
       << "The size (" << header.size << ") in the file does not match the size "
       << "(" << size_ << ") of the pserver: " << serverId_;
@@ -1063,7 +1063,8 @@ void ParameterServer2::saveValueVector(const SaveValueRequest& request,
   CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY]
                                              : *vectors_[PARAMETER_VALUE];
   Parameter::Header header;
-  header.version = Parameter::kFormatVersion;
+  // TODO(TJ): save param headerFormat_
+  header.format = PARAM_FORMAT_ORIGINAL;
   header.valueSize = sizeof(real);
   header.size = size_;
 
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
new file mode 100644
index 0000000000..37e186a408
--- /dev/null
+++ b/paddle/pybind/CMakeLists.txt
@@ -0,0 +1,22 @@
+if(WITH_PYTHON)
+cc_library(paddle_pybind SHARED
+    SRCS pybind.cc
+    DEPS pybind python backward
+    sgd_op
+    gather_op
+    scatter_op
+    add_op
+    mul_op
+    rowwise_add_op
+    sigmoid_op
+    softmax_op
+    mean_op
+    cross_entropy_op
+    recurrent_op
+    uniform_random_op
+    gaussian_random_op
+    fill_zeros_like_op
+    lookup_table_op
+    scale_op
+    minus_op)
+endif(WITH_PYTHON)
diff --git a/paddle/framework/pybind.cc b/paddle/pybind/pybind.cc
similarity index 64%
rename from paddle/framework/pybind.cc
rename to paddle/pybind/pybind.cc
index fe0c87bc57..3bc150ccb7 100644
--- a/paddle/framework/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -18,11 +18,11 @@ limitations under the License. */
 
 #include "paddle/framework/backward.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/tensor_py.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/pybind/tensor_py.h"
 #include "paddle/string/to_string.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@@ -31,7 +31,7 @@ limitations under the License. */
 namespace py = pybind11;
 
 USE_OP(add_two);
-USE_CPU_ONLY_OP(onehot_cross_entropy);
+USE_OP(onehot_cross_entropy);
 USE_OP(sgd);
 USE_OP(mul);
 USE_OP(mean);
@@ -42,35 +42,18 @@ USE_OP(fill_zeros_like);
 USE_OP_ITSELF(recurrent_op);
 USE_OP(gaussian_random);
 USE_OP(uniform_random);
+USE_OP(lookup_table);
+USE_OP(scale);
+USE_OP_ITSELF(identity);
+USE_OP(minus);
+USE_CPU_ONLY_OP(gather);
+USE_CPU_ONLY_OP(scatter);
 
 namespace paddle {
 namespace framework {
 
 using Tensor = framework::Tensor;
 
-template <typename ClassType>
-void ExposeOperator(ClassType &m) {
-  m.def("infer_shape", &ClassType::type::InferShape)
-      .def("run", &ClassType::type::Run)
-      .def("type",
-           [](const typename ClassType::type &op) -> std::string {
-             return op.Type();
-           })
-      .def("outputs",
-           [](const typename ClassType::type &op)
-               -> std::map<std::string, std::vector<std::string>> {
-                 return op.Outputs();
-               })
-      .def("inputs",
-           [](const typename ClassType::type &op) { return op.Inputs(); })
-      .def("__str__", &ClassType::type::DebugString)
-      .def("no_intermediate_outputs",
-           [](const typename ClassType::type &op) {
-             return op.OutputVars(false);
-           })
-      .def("support_gpu", &ClassType::type::SupportGPU);
-}
-
 static size_t UniqueIntegerGenerator() {
   static std::atomic<size_t> generator;
   return generator.fetch_add(1);
@@ -154,26 +137,24 @@ All parameter, weight, gradient are variables in Paddle.
            py::return_value_policy::reference)
       .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
       .def(py::init<>())
-      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
+      .def("new_scope",
+           [](Scope &self) -> Scope * { return &self.NewScope(); },
            py::return_value_policy::reference)
       .def("drop_kids", &Scope::DropKids);
 
   //! @note: Be careful! PyBind will return std::string as an unicode, not
   //! Python str. If you want a str object, you should cast them in Python.
   m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
-    auto &op_info_map = OpRegistry::op_info_map();
     std::vector<py::bytes> ret_values;
-    for (auto it = op_info_map.begin(); it != op_info_map.end(); ++it) {
-      const OpProto *proto = it->second.proto_;
-      if (proto == nullptr) {
-        continue;
-      }
-      PADDLE_ENFORCE(proto->IsInitialized(), "OpProto must all be initialized");
+
+    OpInfoMap::Instance().IterAllInfo([&ret_values](const std::string &type,
+                                                    const OpInfo &info) {
+      if (!info.HasOpProtoAndChecker()) return;
       std::string str;
-      PADDLE_ENFORCE(proto->SerializeToString(&str),
+      PADDLE_ENFORCE(info.Proto().SerializeToString(&str),
                      "Serialize OpProto Error. This could be a bug of Paddle.");
-      ret_values.push_back(py::bytes(str));
-    }
+      ret_values.emplace_back(str);
+    });
     return ret_values;
   });
   m.def_submodule(
@@ -207,75 +188,70 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<>())
       .def("__str__", string::to_string<const platform::CPUPlace &>);
 
-  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
-      m, "Operator");
-
-  operator_base.def_static("create", [](py::bytes protobin) {
-    OpDesc desc;
-    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                   "Cannot parse user input to OpDesc");
-    PADDLE_ENFORCE(desc.IsInitialized(),
-                   "User OpDesc is not initialized, reason %s",
-                   desc.InitializationErrorString());
-    return OpRegistry::CreateOp(desc);
-  });
-
-  operator_base.def("backward",
-                    [](const OperatorBase &forwardOp,
-                       const std::unordered_set<std::string> &no_grad_vars) {
-                      return Backward(forwardOp, no_grad_vars);
-                    });
-
-  ExposeOperator(operator_base);
-
-  py::class_<operators::NetOp, std::shared_ptr<operators::NetOp>> net(m, "Net");
-
-  net.def_static("create",
-                 []() -> std::shared_ptr<operators::NetOp> {
-                   auto retv = std::make_shared<operators::NetOp>();
-                   retv->SetType("plain_net");
-                   return retv;
-                 })
-      .def("add_op", &operators::NetOp::AddOp)
-      .def("add_op",
-           [](operators::NetOp &self,
-              const std::shared_ptr<operators::NetOp> &net) -> void {
-             self.AddOp(std::static_pointer_cast<OperatorBase>(net));
+  py::class_<OperatorBase>(m, "Operator")
+      .def_static("create",
+                  [](py::bytes protobin) {
+                    OpDesc desc;
+                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                                   "Cannot parse user input to OpDesc");
+                    PADDLE_ENFORCE(desc.IsInitialized(),
+                                   "User OpDesc is not initialized, reason %s",
+                                   desc.InitializationErrorString());
+                    return OpRegistry::CreateOp(desc);
+                  })
+      .def("backward",
+           [](const OperatorBase &forwardOp,
+              const std::unordered_set<std::string> &no_grad_vars) {
+             return Backward(forwardOp, no_grad_vars).release();
            })
-      .def("add_op",
-           [](operators::NetOp &self,
-              const std::shared_ptr<operators::RecurrentOp> &rnn) -> void {
-             self.AddOp(std::static_pointer_cast<OperatorBase>(rnn));
+      .def("infer_shape", &OperatorBase::InferShape)
+      .def("run", &OperatorBase::Run)
+      .def("type",
+           [](const OperatorBase &op) -> std::string { return op.Type(); })
+      .def("outputs",
+           [](const OperatorBase &op)
+               -> std::map<std::string, std::vector<std::string>> {
+                 return op.Outputs();
+               })
+      .def("inputs", [](const OperatorBase &op) { return op.Inputs(); })
+      .def("__str__", &OperatorBase::DebugString)
+      .def("no_intermediate_outputs",
+           [](const OperatorBase &op) { return op.OutputVars(false); })
+      .def("support_gpu", &OperatorBase::SupportGPU);
+
+  py::class_<operators::NetOp, OperatorBase>(m, "Net")
+      .def_static("create",
+                  []() -> operators::NetOp * {
+                    auto *retv = new operators::NetOp;
+                    retv->SetType("plain_net");
+                    return retv;
+                  })
+      .def("append_op",
+           [](operators::NetOp &self, const OperatorBase &op) {
+             self.AppendOp(op);
            })
       .def("complete_add_op", &operators::NetOp::CompleteAddOp)
       .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
         self->CompleteAddOp();
       });
 
-  ExposeOperator(net);
-
   // recurrent_op
-  py::class_<operators::RecurrentOp, std::shared_ptr<operators::RecurrentOp>>
-      rnn(m, "RecurrentOp");
-
-  rnn.def_static(
-         "create",
-         [](py::bytes protobin) -> std::shared_ptr<operators::RecurrentOp> {
-           OpDesc desc;
-           PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                          "Cannot parse user input to OpDesc");
-           PADDLE_ENFORCE(desc.IsInitialized(),
-                          "User OpDesc is not initialized, reason %s",
-                          desc.InitializationErrorString());
-           auto rnn_op = OpRegistry::CreateOp(desc);
-           return std::dynamic_pointer_cast<operators::RecurrentOp>(rnn_op);
-         })
+  py::class_<operators::RecurrentOp, OperatorBase>(m, "RecurrentOp")
+      .def_static(
+          "create",
+          [](py::bytes protobin) -> operators::RecurrentOp * {
+            OpDesc desc;
+            PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                           "Cannot parse user input to OpDesc");
+            PADDLE_ENFORCE(desc.IsInitialized(),
+                           "User OpDesc is not initialized, reason %s",
+                           desc.InitializationErrorString());
+            auto rnn_op = OpRegistry::CreateOp(desc);
+            return static_cast<operators::RecurrentOp *>(rnn_op.release());
+          })
       .def("set_stepnet",
-           [](operators::RecurrentOp &self,
-              const std::shared_ptr<operators::NetOp> &net) -> void {
-             self.set_stepnet(net);
-           });
-  ExposeOperator(rnn);
+           [](operators::RecurrentOp &self, const operators::NetOp &net)
+               -> void { self.set_stepnet(net.Clone()); });
 
   m.def("unique_integer", UniqueIntegerGenerator);
 
diff --git a/paddle/framework/tensor_py.h b/paddle/pybind/tensor_py.h
similarity index 92%
rename from paddle/framework/tensor_py.h
rename to paddle/pybind/tensor_py.h
index 4e1ab77b15..39ba60b4dc 100644
--- a/paddle/framework/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -63,8 +63,11 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
       }
       return py::buffer_info(
           dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
-          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
+          sizeof(CUR_TYPE),
+          py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(dst_tensor.dims()),
+          dims_outside,
+          strides);
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@@ -107,8 +110,8 @@ void PyCUDATensorSetFromArray(
 
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
-                                  cudaMemcpyHostToDevice);
+  paddle::platform::GpuMemcpySync(
+      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
 }
 #endif
 
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 6c2f5fed40..1798642022 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -38,7 +38,7 @@ Configuring cmake in /paddle/build ...
       -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
       -DCUDNN_ROOT=/usr/
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-      -DWITH_TESTING=${WITH_TESTING:-OFF}
+      -DWITH_TESTING=${WITH_TESTING:-ON}
       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 ========================================
 EOF
@@ -56,19 +56,18 @@ cmake .. \
       -DWITH_C_API=${WITH_C_API:-OFF} \
       -DWITH_PYTHON=${WITH_PYTHON:-ON} \
       -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
-      -DWITH_TESTING=${WITH_TESTING:-OFF} \
+      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
+      -DWITH_TESTING=${WITH_TESTING:-ON} \
       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 
 cat <<EOF
 ============================================
 Building in /paddle/build ...
-   Build unit tests: ${WITH_TESTING:-OFF}
 ============================================
 EOF
 make -j `nproc`
 
-if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
+if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
 cat <<EOF
 ========================================
 Running unit tests ...
@@ -120,25 +119,6 @@ EOF
     /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
 fi
 
-# generate deb package for current build
-# FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-if [[ ${WITH_DEB:-ON} == "ON" ]]; then
-    cat <<EOF
-========================================
-Generating .deb package ...
-========================================
-EOF
-    set +e
-    cpack -D CPACK_GENERATOR='DEB' -j `nproc` ..
-    err_code=$?
-    if [ ${err_code} -ne 0 ]; then
-        # cat error logs if cpack failed.
-        cat /paddle/build/_CPack_Packages/Linux/DEB/PreinstallOutput.log
-        exit ${err_code}
-    fi
-    set -e
-fi
-
 cat <<EOF
 ========================================
 Generate /paddle/build/Dockerfile ...
@@ -158,15 +138,15 @@ EOF
 fi
 
 cat >> /paddle/build/Dockerfile <<EOF
-# Use different deb file when building different type of images
-ADD *.deb /
+ADD python/dist/*.whl /
 # run paddle version to install python packages first
 RUN apt-get update &&\
     apt-get install -y wget python-pip && pip install -U pip && \
-    dpkg -i /*.deb ; apt-get install -f -y && \
+    pip install /*.whl; apt-get install -f -y && \
     apt-get clean -y && \
-    rm -f /*.deb && \
-    paddle version
+    rm -f /*.whl && \
+    paddle version && \
+    ldconfig
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
 ADD go/cmd/pserver/pserver /usr/bin/
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 2ab7d5b52f..26f9c0fcd4 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -56,8 +56,7 @@ if [ -z "${PADDLE_NO_STAT+x}" ]; then
     fi
 fi
 
-
-MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 if [ ! -z "${DEBUGGER}" ]; then
     echo "Using debug command ${DEBUGGER}"
@@ -93,34 +92,16 @@ else:
   sys.exit(0)
 EOF
 
-if [ $? -eq 1 ]; then  # Older version installed, or not installed at all
-    echo "First time run paddle, need to install some python dependencies."
-    # setuptools normalizes package version, so we need to use normalized
-    # package version for paddle python package
-    PYTHON_PADDLE_VERSION=$(python -c 'import packaging.version
-import setuptools
-print str(packaging.version.Version("@PADDLE_VERSION@"))
-' 2>/dev/null)
-    BASEDIR=$(dirname "$0")
-    pip install ${BASEDIR}/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION}-*.whl
-    if [ $? -ne 0 ]; then
-	echo "pip install wheels failed. "
-	echo "Please use 'sudo paddle' at the first time you use PaddlePaddle"
-	echo "PaddlePaddle will install some python dependencies automatically."
-	exit 1
-    fi
-    echo "Python dependencies are installed."
-fi
 
 case "$1" in
     "train")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_trainer ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
         ;;
     "merge_model")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_merge_model ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_merge_model ${@:2}
         ;;
     "pserver")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_pserver_main ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_pserver_main ${@:2}
         ;;
     "dump_config")
         python -m paddle.utils.dump_config ${@:2}
@@ -129,7 +110,7 @@ case "$1" in
         python -m paddle.utils.make_model_diagram ${@:2}
         ;;
     "usage")
-        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
+        $PADDLE_BIN_PATH/paddle_usage ${@:2}
         ;;
     "version")
         version
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index eba40862b9..a0a365aa0b 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -29,7 +29,6 @@ DECLARE_bool(with_gpu);
 DECLARE_bool(parallel_nn);
 DECLARE_string(config_args);
 DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkldnn_wgt);
 
 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
@@ -47,7 +46,6 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
              << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
              << ",parallel_nn=" << FLAGS_parallel_nn
              << ",use_mkldnn=" << FLAGS_use_mkldnn
-             << ",use_mkldnn_wgt=" << FLAGS_use_mkldnn_wgt
              << ",cudnn_version=" << hl_get_cudnn_lib_version();
   if (!FLAGS_config_args.empty()) {
     configArgs << "," << FLAGS_config_args;
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 600c83a848..ab1c181c62 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -27,7 +27,6 @@ DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training");
 DEFINE_bool(use_mkldnn, false, "Only support CPU training");
 #endif
 
-DEFINE_bool(use_mkldnn_wgt, false, "Init weight from CPU weight");
 DEFINE_bool(parallel_nn,
             false,
             "Whether to use multi-threads to calculate one neural network."
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index 0aca4c0ee0..1832bb515e 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -41,4 +41,3 @@ DECLARE_string(predict_file);
 DECLARE_bool(prev_batch_state);
 DECLARE_string(init_model_path);
 DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkldnn_wgt);
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 4f3d5bf3f6..1ea1e05259 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -82,6 +82,9 @@ message ConvConfig {
 
   // if not set, use img_size
   optional uint32 img_size_y = 14;
+
+  optional uint32 dilation = 15 [ default = 1 ];
+  optional uint32 dilation_y = 16 [ default = 1 ];
 }
 
 message PoolConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index da99e5bd53..efc7676466 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -338,7 +338,8 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
         in_links_count += 1
         layer_name = MakeLayerNameInParentSubmodel(name)
         layer = g_layer_map[layer_name]
-        ScatterAgentLayer(name=name, size=layer.size)
+        ScatterAgentLayer(
+            name=name, size=layer.size, width=layer.width, height=layer.height)
 
         pair = g_current_submodel.in_links.add()
         pair.layer_name = layer_name
@@ -869,12 +870,16 @@ class Conv(Cfg):
                  caffe_mode=True,
                  filter_size_y=None,
                  padding_y=None,
-                 stride_y=None):
+                 stride_y=None,
+                 dilation=None,
+                 dilation_y=None):
         self.add_keys(locals())
         if filter_size_y is None:
             self.filter_size_y = filter_size
         if padding_y is None:
             self.padding_y = padding
+        if dilation_y is None:
+            self.dilation_y = dilation
         if stride_y is None:
             self.stride_y = stride
         if output_x is not None:
@@ -2197,8 +2202,8 @@ class MaxOutLayer(LayerBase):
         maxout_conf = self.config.inputs[0].maxout_conf
         parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
         out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
-        self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
-                           g_layer_map[input_layer.name].width, out_channels)
+        self.set_cnn_layer(name, maxout_conf.image_conf.img_size_y,
+                           maxout_conf.image_conf.img_size, out_channels)
 
 
 @config_layer('row_conv')
@@ -2232,6 +2237,20 @@ class ClipLayer(LayerBase):
         self.config.inputs[0].clip_conf.max = max
 
 
+@config_layer('scale_shift')
+class ScaleShiftLayer(LayerBase):
+    def __init__(self, name, inputs, bias=True, **xargs):
+        super(ScaleShiftLayer, self).__init__(
+            name, 'scale_shift', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'ScaleShiftLayer must have one and only one input.')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)  # same size as the input
+        self.create_input_parameter(0, 1, [1, 1])  # size-1 weight, dims 1x1
+        self.create_bias_parameter(bias, 1)  # size-1 bias, governed by `bias`
+
+
 # key: cost type
 # value: cost class
 g_cost_map = {}
@@ -2391,9 +2410,11 @@ class GatherAgentLayer(LayerBase):
 
 @config_layer('scatter_agent')
 class ScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
+    def __init__(self, name, size, width=None, height=None, device=None):
         super(ScatterAgentLayer, self).__init__(
             name, 'scatter_agent', size, inputs=[], device=device)
+        if height and width:
+            self.set_layer_height_width(height, width)
 
 
 @config_layer('multiplex')
@@ -2677,6 +2698,49 @@ class SubSequenceLayer(LayerBase):
         self.create_bias_parameter(bias, size)
 
 
+@config_layer('seq_slice')
+class SeqSliceLayer(LayerBase):
+    def __init__(self, name, inputs, starts, ends, bias=False, **xargs):
+        if isinstance(inputs, list):
+            assert len(inputs) == 1, ('the first input of sequence slice layer '
+                                      'is a single sequence input.')
+        else:
+            inputs = [inputs]  # normalize a single input to a list
+
+        if starts is not None:
+            if isinstance(starts, list):
+                assert len(starts) == 1, (
+                    'the start indices for sequence slice layer cannot '
+                    'be a list having more than one element.')
+                starts = starts[0]
+            inputs.append(starts)  # start indices follow the sequence input
+
+        if ends is not None:
+            if isinstance(ends, list):
+                assert len(ends) == 1, (
+                    'the end indices for sequence slice layer cannot '
+                    'be a list having more than one element.')
+                ends = ends[0]
+            inputs.append(ends)  # end indices come last
+        assert len(inputs) >= 2, (
+            'the sequence slice layer has at least two inputs.')
+
+        super(SeqSliceLayer, self).__init__(
+            name, 'seq_slice', 0, inputs=inputs, **xargs)
+
+        input_layer0 = self.get_input_layer(0)
+        size = input_layer0.size
+        self.set_layer_size(size)  # output size mirrors the sequence input
+
+        if len(inputs) == 3:  # both starts and ends were given
+            assert (
+                self.get_input_layer(1).size == self.get_input_layer(2).size), (
+                    'If start and end indices are both given to '
+                    'sequence slice layer, they should have the same width.')
+        elif len(inputs) == 2:  # only one of starts/ends was given
+            self.config.select_first = (starts is not None)
+
+
 @config_layer('sub_nested_seq')
 class SubNestedSequenceLayer(LayerBase):
     def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 1bc55c8696..862265f2cd 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -16,11 +16,13 @@ import functools
 import collections
 import inspect
 
+import paddle.trainer.config_parser as cp
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
     ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType
+from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
+    CudnnAvgPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
 
@@ -132,7 +134,9 @@ __all__ = [
     'sub_nested_seq_layer',
     'clip_layer',
     'slice_projection',
+    'seq_slice_layer',
     'kmax_sequence_score_layer',
+    'scale_shift_layer',
 ]
 
 
@@ -228,8 +232,10 @@ class LayerType(object):
     CROP_LAYER = 'crop'
     SUB_NESTED_SEQ = 'sub_nested_seq'
     CLIP_LAYER = 'clip'
+    SEQ_SLICE = 'seq_slice'
 
     KMAX_SEQ_SCORE = 'kmax_seq_score'
+    SCALE_SHIFT_LAYER = 'scale_shift'
 
     @staticmethod
     def is_layer_type(type_name):
@@ -328,6 +334,14 @@ class LayerOutput(object):
         self.outputs = outputs
         self.reverse = reverse
 
+    @property
+    def width(self):
+        return cp.g_layer_map[self.full_name].width
+
+    @property
+    def height(self):
+        return cp.g_layer_map[self.full_name].height
+
     def set_input(self, input):
         """
         Set the input for a memory layer. Can only be used for memory layer
@@ -909,7 +923,13 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
         width=width,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
-    return LayerOutput(name, LayerType.DATA, size=size)
+    num_filters = None
+    if height is not None and width is not None:
+        num_filters = size / (width * height)
+        assert num_filters * width * height == size, \
+            "size=%s width=%s height=%s" % (size, width, height)
+
+    return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
 
 
 @wrap_name_default("embedding")
@@ -2322,6 +2342,7 @@ def img_conv_layer(input,
                    groups=1,
                    stride=1,
                    padding=0,
+                   dilation=1,
                    bias_attr=None,
                    param_attr=None,
                    shared_biases=True,
@@ -2329,6 +2350,7 @@ def img_conv_layer(input,
                    filter_size_y=None,
                    stride_y=None,
                    padding_y=None,
+                   dilation_y=None,
                    trans=False,
                    layer_type=None):
     """
@@ -2393,6 +2415,11 @@ def img_conv_layer(input,
     :type padding: int|tuple|list
     :param padding_y: The y dimension of the padding.
     :type padding_y: int
+    :param dilation: The x dimension of the dilation, or a tuple giving the
+                    dilation for both image dimensions.
+    :type dilation: int|tuple|list
+    :param dilation_y: The y dimension of the dilation.
+    :type dilation_y: int
     :param bias_attr: Convolution bias attribute. None means default bias.
                       False means no bias.
     :type bias_attr: ParameterAttribute|False
@@ -2440,6 +2467,13 @@ def img_conv_layer(input,
         else:
             padding_y = padding
 
+    if dilation_y is None:
+        if isinstance(dilation, collections.Sequence):
+            assert len(dilation) == 2
+            dilation, dilation_y = dilation
+        else:
+            dilation_y = dilation
+
     if param_attr.attr.get('initial_smart'):
         # special initial for conv layers.
         init_w = (2.0 / (filter_size**2 * num_channels))**0.5
@@ -2449,6 +2483,8 @@ def img_conv_layer(input,
         param_attr.attr["initial_smart"] = False
 
     if layer_type:
+        if dilation > 1 or dilation_y > 1:
+            assert layer_type in ["cudnn_conv", "cudnn_convt"]
         if trans:
             assert layer_type in ["exconvt", "cudnn_convt"]
         else:
@@ -2464,11 +2500,13 @@ def img_conv_layer(input,
             conv=Conv(
                 filter_size=filter_size,
                 padding=padding,
+                dilation=dilation,
                 stride=stride,
                 channels=num_channels,
                 groups=groups,
                 filter_size_y=filter_size_y,
                 padding_y=padding_y,
+                dilation_y=dilation_y,
                 stride_y=stride_y),
             **param_attr.attr),
         active_type=act.name,
@@ -2574,11 +2612,14 @@ def img_pool_layer(input,
     elif isinstance(pool_type, AvgPooling):
         pool_type.name = 'avg'
 
+    assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
+                               CudnnMaxPooling], \
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
+
     type_name = pool_type.name + '-projection' \
         if (
         isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
         else pool_type.name
-
     pool_size_y = pool_size if pool_size_y is None else pool_size_y
     stride_y = stride if stride_y is None else stride_y
     padding_y = padding if padding_y is None else padding_y
@@ -4202,8 +4243,7 @@ def conv_operator(img,
         num_channels = img.num_filters
 
     assert isinstance(filter, LayerOutput)
-    if filter.size is not None:
-        filter.size = filter_size * filter_size_y * num_filters * num_channels
+    assert filter.size is not None
 
     opCls = ConvTransOperator if trans else ConvOperator
 
@@ -4914,7 +4954,6 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    assert input.layer_type == LayerType.CONV_LAYER
     assert isinstance(input.activation, LinearActivation)
     assert groups > 1
     if num_channels is None:
@@ -6174,6 +6213,72 @@ def clip_layer(input, min, max, name=None):
         name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
 
 
+@wrap_name_default()
+def seq_slice_layer(input, starts, ends, name=None):
+    """
+    seq_slice_layer will return one or several sub-sequences from the
+    input sequence layer given start and end indices.
+
+        - If only start indices are given, and end indices are set to None,
+          this layer slices the input sequence from the given start indices
+          to its end.
+        - If only end indices are given, and start indices are set to None,
+          this layer slices the input sequence from its beginning to the
+          given end indices.
+        - If start and end indices are both given, they should have the same
+          number of elements.
+
+    If the start or end indices contain more than one element, the input
+    sequence will be sliced multiple times.
+
+
+    .. code-block:: python
+
+        seq_slice = seq_slice_layer(input=input_seq,
+                                    starts=start_pos, ends=end_pos)
+
+    :param name: name of this layer.
+    :type name: basestring
+    :param input: input for this layer, it should be a sequence.
+    :type input: LayerOutput
+    :param starts: start indices to slice the input sequence.
+    :type starts: LayerOutput|None
+    :param ends: end indices to slice the input sequence.
+    :type ends: LayerOutput|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of seq_slice layer must be a PaddlePaddle layer.')
+
+    if starts is not None:
+        assert isinstance(starts, LayerOutput), (
+            'The start indices for seq_slice layer '
+            'must be a PaddlePaddle layer.')
+    if ends is not None:
+        assert isinstance(ends, LayerOutput), (
+            'The end indices for seq_slice layer must be a PaddlePaddle layer.')
+    assert starts is not None or ends is not None, (
+        'start and end indices '
+        'cannot be set to None at the same time, at least one of '
+        'them should be given.')
+    if starts is not None and ends is not None:
+        assert starts.size == ends.size, (
+            'If start and end indices are both given to seq_slice_layer, '
+            'they should have the same width.')
+
+    Layer(
+        name=name,
+        type=LayerType.SEQ_SLICE,
+        inputs=input.name,
+        starts=starts.name if starts is not None else None,
+        ends=ends.name if ends is not None else None)
+    return LayerOutput(
+        name, LayerType.SEQ_SLICE, parents=[input], size=input.size)
+
+
 @wrap_name_default()
 @layer_support()
 def kmax_sequence_score_layer(input, name=None, beam_size=1):
@@ -6210,3 +6315,43 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
 
     return LayerOutput(
         name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
+
+
+@wrap_name_default("scale_shift")
+@wrap_param_attr_default()
+@wrap_bias_attr_default()
+def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
+    """
+    This layer applies a linear transformation to each element in each row of
+    the input matrix. For each element, the layer first rescales it and then
+    adds a bias to it.
+
+    This layer is very similar to SlopeInterceptLayer, except that the scale
+    and bias are trainable.
+
+    .. math::
+
+        y = w * x + b
+
+    .. code-block:: python
+
+        scale_shift = scale_shift_layer(input=input_layer, bias_attr=False)
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param param_attr: The parameter attribute of scaling.
+    :type param_attr: ParameterAttribute
+    :param bias_attr: The parameter attribute of shifting.
+    :type bias_attr: ParameterAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.SCALE_SHIFT_LAYER,
+        inputs=Input(input.name, **param_attr.attr),
+        bias=ParamAttr.to_bias(bias_attr))
+    return LayerOutput(
+        name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index a61beb871a..1ca5c8a07e 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -8,6 +8,7 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
-test_kmax_seq_socre_layer test_seq_select_layers)
+test_kmax_seq_socre_layer test_seq_select_layers test_scale_shift_layer
+test_seq_slice_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
index 9fda16a540..01d31ef3fa 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
@@ -12,6 +12,7 @@ img_conv = img_conv_layer(
     num_filters=64,
     filter_size=(32, 32),
     padding=(1, 1),
+    dilation=(1, 1),
     stride=(1, 1),
     act=LinearActivation())
 img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
index 81bd71f68e..3d32220bfb 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
@@ -1,12 +1,6 @@
 type: "nn"
 layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "data"
+  name: "input_seq"
   type: "data"
   size: 128
   active_type: ""
@@ -17,7 +11,7 @@ layers {
   size: 1
   active_type: "exponential"
   inputs {
-    input_layer_name: "data"
+    input_layer_name: "input_seq"
     input_parameter_name: "___fc_layer_0__.w0"
   }
   bias_parameter_name: "___fc_layer_0__.wbias"
@@ -51,15 +45,14 @@ parameters {
   initial_strategy: 0
   initial_smart: false
 }
-input_layer_names: "data"
+input_layer_names: "input_seq"
 output_layer_names: "__kmax_sequence_score_layer_0__"
 sub_models {
   name: "root"
-  layer_names: "input"
-  layer_names: "data"
+  layer_names: "input_seq"
   layer_names: "__fc_layer_0__"
   layer_names: "__kmax_sequence_score_layer_0__"
-  input_layer_names: "data"
+  input_layer_names: "input_seq"
   output_layer_names: "__kmax_sequence_score_layer_0__"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
new file mode 100644
index 0000000000..35ade126a2
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
@@ -0,0 +1,72 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__scale_shift_0__"
+  type: "scale_shift"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___scale_shift_0__.w0"
+  }
+}
+layers {
+  name: "__scale_shift_1__"
+  type: "scale_shift"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___scale_shift_1__.w0"
+  }
+  bias_parameter_name: "___scale_shift_1__.wbias"
+}
+parameters {
+  name: "___scale_shift_0__.w0"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 1.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___scale_shift_1__.w0"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 1.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___scale_shift_1__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__scale_shift_0__"
+output_layer_names: "__scale_shift_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__scale_shift_0__"
+  layer_names: "__scale_shift_1__"
+  input_layer_names: "data"
+  output_layer_names: "__scale_shift_0__"
+  output_layer_names: "__scale_shift_1__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
new file mode 100644
index 0000000000..5b73d614fe
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
@@ -0,0 +1,79 @@
+type: "nn"
+layers {
+  name: "word"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "starts"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "ends"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "__seq_slice_layer_0__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "starts"
+  }
+  inputs {
+    input_layer_name: "ends"
+  }
+}
+layers {
+  name: "__seq_slice_layer_1__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "starts"
+  }
+  select_first: true
+}
+layers {
+  name: "__seq_slice_layer_2__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "ends"
+  }
+  select_first: false
+}
+input_layer_names: "word"
+output_layer_names: "__seq_slice_layer_0__"
+output_layer_names: "__seq_slice_layer_1__"
+output_layer_names: "__seq_slice_layer_2__"
+sub_models {
+  name: "root"
+  layer_names: "word"
+  layer_names: "starts"
+  layer_names: "ends"
+  layer_names: "__seq_slice_layer_0__"
+  layer_names: "__seq_slice_layer_1__"
+  layer_names: "__seq_slice_layer_2__"
+  input_layer_names: "word"
+  output_layer_names: "__seq_slice_layer_0__"
+  output_layer_names: "__seq_slice_layer_1__"
+  output_layer_names: "__seq_slice_layer_2__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
index d245c5a41c..48d0cd55da 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
@@ -2,9 +2,7 @@
 #coding=utf-8
 from paddle.trainer_config_helpers import *
 
-data = data_layer(name='input', size=300)
-
-data = data_layer(name="data", size=128)
+data = data_layer(name="input_seq", size=128)
 scores = fc_layer(input=data, size=1, act=ExpActivation())
 kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5)
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
new file mode 100644
index 0000000000..dd589116fa
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
@@ -0,0 +1,9 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=100)
+
+scale = scale_shift_layer(input=data, bias_attr=False)
+
+scale_shift = scale_shift_layer(input=data)
+
+outputs(scale, scale_shift)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
new file mode 100644
index 0000000000..510ad32208
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+input_seq = data_layer("word", size=128)
+starts = data_layer("starts", size=5)
+ends = data_layer("ends", size=5)
+
+seq_slice1 = seq_slice_layer(input=input_seq, starts=starts, ends=ends)
+seq_slice2 = seq_slice_layer(input=input_seq, starts=starts, ends=None)
+seq_slice3 = seq_slice_layer(input=input_seq, starts=None, ends=ends)
+
+outputs(seq_slice1, seq_slice2, seq_slice3)
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 96fad9b42e..661ebd8964 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,6 +13,8 @@ py_test(test_add_two_op SRCS test_add_two_op.py)
 py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
 py_test(test_softmax_op SRCS test_softmax_op.py)
 py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
+py_test(test_gather_op SRCS test_gather_op.py)
+py_test(test_scatter_op SRCS test_scatter_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
 
 py_test(gradient_checker SRCS gradient_checker.py)
@@ -22,6 +24,11 @@ py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
 
 py_test(test_operator SRCS test_operator.py)
-# py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
+py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
 py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
 py_test(test_recurrent_op SRCS test_recurrent_op.py)
+py_test(test_sgd_op SRCS test_sgd_op.py)
+py_test(test_gradient_checker SRCS test_gradient_checker.py)
+py_test(test_lookup_table SRCS test_lookup_table.py)
+py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
+py_test(mnist SRCS mnist.py)
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index 501cf6110f..9a7a7fbf5e 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -1,6 +1,7 @@
 import unittest
 
 import numpy
+import itertools
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
 
@@ -8,6 +9,7 @@ __all__ = ['get_numeric_gradient']
 
 
 def create_op(op_type):
+    # TODO need to set attrs
     kwargs = dict()
     for in_name in Operator.get_op_input_names(op_type):
         kwargs[in_name] = in_name
@@ -21,12 +23,17 @@ def grad_var_name(var_name):
     return var_name + "@GRAD"
 
 
+def empty_var_name():
+    return "@EMPTY@"
+
+
 def get_numeric_gradient(op,
                          input_values,
                          output_name,
                          input_to_check,
                          delta=0.005,
-                         local_scope=None):
+                         local_scope=None,
+                         in_place=False):
     """
     Get Numeric Gradient for an operator's input.
     
@@ -66,7 +73,6 @@ def get_numeric_gradient(op,
             local_scope.find_var(output).get_tensor().alloc_float(core.CPUPlace(
             ))
 
-    # TODO(yuyang18): Only CPU is support now.
     cpu_ctx = core.DeviceContext.create(core.CPUPlace())
 
     def get_output():
@@ -76,6 +82,11 @@ def get_numeric_gradient(op,
     def product(dim):
         return reduce(lambda a, b: a * b, dim, 1)
 
+    def restore_inputs():
+        for var_name in input_values:
+            tensor_ = local_scope.find_var(var_name).get_tensor()
+            tensor_.set(numpy.copy(input_values[var_name]), core.CPUPlace())
+
     # get the input tensor that we want to get it's numeric gradient.
     tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
     tensor_size = product(tensor_to_check.get_dims())
@@ -85,6 +96,8 @@ def get_numeric_gradient(op,
     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
     for i in xrange(tensor_size):
+        if in_place:
+            restore_inputs()
         # get one input element throw it's index i.
         origin = tensor_to_check.get_float_element(i)
 
@@ -94,6 +107,8 @@ def get_numeric_gradient(op,
         y_pos = get_output()
 
         # plus delta to this element, run op and get the sum of the result tensor.
+        if in_place:
+            restore_inputs()
         x_neg = origin - delta
         tensor_to_check.set_float_element(i, x_neg)
         y_neg = get_output()
@@ -109,12 +124,120 @@ def get_numeric_gradient(op,
 
 
 class GradientChecker(unittest.TestCase):
-    def assert_is_close(self, numeric_grads, scope, max_relative_error,
-                        msg_prefix):
-        for name in numeric_grads:
-            b = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
-            a = numeric_grads[name]
+    def __get_gradient(self, forward_op, backward_op, input_value, grad_names,
+                       place):
+        """Get the input gradients after running forward and backward operators
+        on the given place.
+
+        :param forward_op: forward operator
+        :type forward_op: Operator
+        :param backward_op: backward operator
+        :type backward_op: Operator
+        :param input_value: input values.
+        :type input_value: dict{string:numpy.array}
+        :param grad_names: the names of returned input gradients.
+        :type grad_names: a list of string
+        :param place: the device type.
+        :type place: CPUPlace or GPUPlace
+        :return: the input gradients of given grad_names.
+        :rtype: a list of numpy.array
+        """
+        scope = core.Scope()
+        ctx = core.DeviceContext.create(place)
 
+        inputs = forward_op.inputs()
+        in_names = [item for k in inputs for item in inputs[k]]
+        outputs = forward_op.outputs()
+        out_names = [item for k in outputs for item in outputs[k]]
+
+        # create input var and set value
+        for name, value in input_value.iteritems():
+            if name not in in_names:
+                raise ValueError(name + " does not exist in Op's inputs.")
+            var = scope.new_var(name).get_tensor()
+            var.set_dims(value.shape)
+            var.set(value, place)
+
+        # run forward op
+        for out_name in out_names:
+            scope.new_var(out_name)
+        forward_op.infer_shape(scope)
+        forward_op.run(scope, ctx)
+
+        # set output var's shape
+        # set output grad to ones
+        for name in out_names:
+            out_tensor = scope.find_var(name).get_tensor()
+            grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
+            grad_tensor.set_dims(out_tensor.shape())
+            data = numpy.ones(out_tensor.shape(), dtype=numpy.float32)
+            grad_tensor.set(data, place)
+
+        # run backward op
+        backward_outs = backward_op.outputs()
+        backward_names = [
+            item for key in backward_outs for item in backward_outs[key]
+        ]
+        for name in backward_names:
+            scope.new_var(name)
+
+        backward_op.infer_shape(scope)
+        backward_op.run(scope, ctx)
+
+        outs = [
+            numpy.array(scope.find_var(name).get_tensor())
+            for name in grad_names
+        ]
+        return outs
+
+    def compare_grad(self, forward_op, input_value, no_grad_set=None):
+        """ Compare the input gradients between CPU and GPU for the given forward
+        operator.
+
+        :param forward_op: forward operator
+        :type forward_op: Operator
+        :param input_value: input values.
+        :type input_value: dict{string:numpy.array}
+        :param no_grad_set: the set of variables names without gradients.
+        :type no_grad_set: a set of string
+        :raises: AssertionError, there is different gradient value.
+        """
+        if no_grad_set is None:
+            no_grad_set = set()
+        backward_op = core.Operator.backward(forward_op, no_grad_set)
+        # return if not compile with GPU or not implementing GPU kernel
+        if not (core.is_compile_gpu() and backward_op.support_gpu()):
+            return
+
+        outputs = backward_op.outputs()
+        out_names = [item for k in outputs for item in outputs[k]]
+        out_names = filter(lambda x: x != empty_var_name(), out_names)
+        cpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
+                                        out_names, core.CPUPlace())
+        gpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
+                                        out_names, core.GPUPlace(0))
+
+        for c_grad, g_grad, name in itertools.izip(cpu_grads, gpu_grads,
+                                                   out_names):
+            self.assertTrue(
+                numpy.allclose(
+                    c_grad, g_grad, atol=1e-4),
+                "output name: " + name + " has diff")
+
+    def __assert_is_close(self, numeric_grads, analytic_grads, names,
+                          max_relative_error, msg_prefix):
+        """Use relative error for the comparison.
+
+        :param numeric_grads: the numerical gradients.
+        :type numeric_grads: a list of numpy.array
+        :param analytic_grads: the analytical gradients.
+        :type analytic_grads: a list of numpy.array
+        :param names: the names of gradients, used to print for debug.
+        :type names: a list of string
+        :param msg_prefix: string info, used to print for debug.
+        :type msg_prefix: string
+        """
+        for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
             abs_a = numpy.abs(a)
             # if abs_a is nearly zero, then use abs error for a, not relative
             # error.
@@ -138,6 +261,7 @@ class GradientChecker(unittest.TestCase):
                    output_name,
                    no_grad_set=None,
                    only_cpu=False,
+                   in_place=False,
                    max_relative_error=0.005):
         """
         :param forward_op: used to create backward_op
@@ -159,106 +283,27 @@ class GradientChecker(unittest.TestCase):
 
         inputs = forward_op.inputs()
         in_names = [item for k in inputs for item in inputs[k]]
-        outputs = forward_op.outputs()
-        out_names = [item for k in outputs for item in outputs[k]]
-
         for no_grad in no_grad_set:
             if no_grad not in in_names:
                 raise ValueError("no_grad should be in in_names")
-
         backward_op = core.Operator.backward(forward_op, no_grad_set)
 
-        bwd_outputs = backward_op.outputs()
-        bwd_out_names = [item for k in bwd_outputs for item in bwd_outputs[k]]
-
         places = [core.CPUPlace()]
         if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu():
             places.append(core.GPUPlace(0))
 
-        numeric_grad = dict()
-        # get numeric gradient
-        for check_name in inputs_to_check:
-            numeric_grad[check_name] = \
-                get_numeric_gradient(forward_op, input_vars, output_name,
-                                     check_name)
+        # get numerical gradients
+        numeric_grads = [
+            get_numeric_gradient(
+                forward_op, input_vars, output_name, name, in_place=in_place)
+            for name in inputs_to_check
+        ]
 
-        # get operator gradient according to different device
+        check_names = [grad_var_name(name) for name in inputs_to_check]
         for place in places:
-            scope = core.Scope()
-            ctx = core.DeviceContext.create(place)
-
-            # create input var and set value
-            for name, value in input_vars.iteritems():
-                if name not in in_names:
-                    raise ValueError(name + " not in op.inputs_")
-                var = scope.new_var(name).get_tensor()
-                var.set_dims(value.shape)
-                var.set(value, place)
-
-            # create output var
-            for out_name in out_names:
-                scope.new_var(out_name).get_tensor()
-
-            # infer the shape of output var and compute/set value of output var
-            forward_op.infer_shape(scope)
-            forward_op.run(scope, ctx)
-
-            # create output grad var
-            # set shape as the output var
-            # set value of this grad to ones
-            for name in out_names:
-                out_tensor = scope.find_var(name).get_tensor()
-                grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
-                grad_tensor.set_dims(out_tensor.shape())
-                data = 1.0 * numpy.ones(out_tensor.shape())
-                grad_tensor.set(data, place)
-
-            # create input grad var
-            for name in bwd_out_names:
-                scope.new_var(name).get_tensor()
-
-            # infer the shape of input gradient var and compute/set it's value
-            # with backward op
-            backward_op.infer_shape(scope)
-            backward_op.run(scope, ctx)
-
-            self.assert_is_close(numeric_grad, scope, max_relative_error,
-                                 "Gradient Check On %s" % str(place))
-
-
-if __name__ == '__main__':
-
-    class GetNumericGradientTest(unittest.TestCase):
-        def test_add_op(self):
-            add_op = Operator('add_two', X="X", Y="Y", Out="Z")
-            x = numpy.random.random((10, 1)).astype("float32")
-            y = numpy.random.random((10, 1)).astype("float32")
-
-            arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
-            self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2)
-
-        def test_softmax_op(self):
-            def stable_softmax(x):
-                """Compute the softmax of vector x in a numerically stable way."""
-                shiftx = x - numpy.max(x)
-                exps = numpy.exp(shiftx)
-                return exps / numpy.sum(exps)
-
-            def label_softmax_grad(Y, dY):
-                dX = Y * 0.0
-                for i in range(Y.shape[0]):
-                    d = numpy.dot(Y[i, :], dY[i, :])
-                    dX[i, :] = Y[i, :] * (dY[i, :] - d)
-                return dX
-
-            softmax_op = Operator("softmax", X="X", Y="Y")
-
-            X = numpy.random.random((2, 2)).astype("float32")
-            Y = numpy.apply_along_axis(stable_softmax, 1, X)
-            dY = numpy.ones(Y.shape)
-            dX = label_softmax_grad(Y, dY)
-
-            arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X')
-            numpy.testing.assert_almost_equal(arr, dX, decimal=1e-2)
-
-    unittest.main()
+            # get analytical gradients according to different device
+            analytic_grads = self.__get_gradient(forward_op, backward_op,
+                                                 input_vars, check_names, place)
+            self.__assert_is_close(numeric_grads, analytic_grads, check_names,
+                                   max_relative_error,
+                                   "Gradient Check On %s" % str(place))
diff --git a/python/paddle/v2/framework/tests/mnist.py b/python/paddle/v2/framework/tests/mnist.py
new file mode 100644
index 0000000000..9a0b109850
--- /dev/null
+++ b/python/paddle/v2/framework/tests/mnist.py
@@ -0,0 +1,249 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+import numpy
+import paddle.v2 as paddle
+
+BATCH_SIZE = 100
+
+scope = core.Scope()
+place = core.CPUPlace()
+# if you want to test GPU training, you can use gpu place
+# place = core.GPUPlace(0)
+dev_ctx = core.DeviceContext.create(place)
+
+init_net = core.Net.create()
+forward_net = core.Net.create()
+backward_net = None
+optimize_net = core.Net.create()
+
+
+def atomic_id():
+    id = 0
+    while True:
+        yield id
+        id += 1
+
+
+uniq_id = atomic_id().next
+
+
+def data_layer(name, dims):
+    var = scope.new_var(name)
+    tensor = var.get_tensor()
+    tensor.set_dims(dims)  # 1 is batch size holder.
+    return name
+
+
+def feed_data(name, data):
+    assert isinstance(data, numpy.ndarray)
+    tensor = scope.find_var(name).get_tensor()
+    tensor.set_dims(data.shape)
+    if data.dtype == numpy.dtype('int32'):
+        tensor.alloc_int(place)
+    elif data.dtype == numpy.dtype('float32'):
+        tensor.alloc_float(place)
+    else:
+        raise ValueError("data type not supported")
+    tensor.set(data, place)
+
+
+def grad_var_name(var_name):
+    return var_name + "@GRAD"
+
+
+def sgd_optimizer(net, param_name, learning_rate=0.005):
+    grad_name = grad_var_name(param_name)
+    optimize_op = Operator(
+        "sgd",
+        param=param_name,
+        grad=grad_name,
+        param_out=param_name,
+        learning_rate=learning_rate)
+    net.append_op(optimize_op)
+
+
+# should use operator and add these to the init_network
+def init_param(net, param_name, dims):
+    scope.new_var(param_name)
+    op = Operator(
+        "uniform_random", Out=param_name, dims=dims, min=-0.5, max=0.5, seed=10)
+    op.infer_shape(scope)
+    net.append_op(op)
+
+
+# fc_layer
+def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
+    """
+    Add a fc layer to net
+
+    :param input: input variable name.
+    :type input: str
+    :param size: fully connected layer size.
+    :param act: activation name
+    :param param: parameter attribute, used for initialize parameters.
+    :param bias: bias attribute. False will not have a bias.
+    :param name: the name of fc layer. If not set, model will generate a
+    readable name
+    :return: output variable name.
+    """
+    if name is None:
+        name = 'fc_%d' % uniq_id()
+    if not isinstance(name, str):
+        raise ValueError("name should be string")
+
+    input_dims = scope.find_var(input).get_tensor().get_dims()
+
+    w_name = param or name + ".w"
+    init_param(net=init_net, param_name=w_name, dims=[input_dims[1], size])
+    sgd_optimizer(net=optimize_net, param_name=w_name, learning_rate=0.01)
+
+    pre_activation = name + ".mul.out"
+    scope.new_var(pre_activation)
+    mul_op = Operator("mul", X=input, Y=w_name, Out=pre_activation)
+    net.append_op(mul_op)
+
+    # create bias variable if needed
+    if bias:
+        bias_name = name + ".b"
+        init_param(net=init_net, param_name=bias_name, dims=[size])
+        sgd_optimizer(
+            net=optimize_net, param_name=bias_name, learning_rate=0.001)
+        bias_out = name + ".rowwise_add.out"
+        scope.new_var(bias_out)
+        rowwise_append_op = Operator(
+            "rowwise_add", X=pre_activation, b=bias_name, Out=bias_out)
+        net.append_op(rowwise_append_op)
+        pre_activation = bias_out
+
+    activation_op = Operator(act, X=pre_activation, Y=name)
+    net.append_op(activation_op)
+    scope.new_var(name)
+    net.infer_shape(scope)
+    return name
+
+
+def cross_entropy_layer(net, input, label):
+    cost_name = 'cross_entropy_%d' % uniq_id()
+    cross_entropy_op = Operator(
+        "onehot_cross_entropy", X=input, label=label, Y=cost_name)
+    net.append_op(cross_entropy_op)
+    scope.new_var(cost_name)
+    net.infer_shape(scope)
+    return cost_name
+
+
+def create_backward_net(forward_net):
+    net = core.Operator.backward(forward_net, set())
+    for input in net.inputs()["all"]:
+        var = scope.new_var(input)
+        var.get_tensor()
+    for output in net.outputs()["all"]:
+        var = scope.new_var(output)
+        var.get_tensor()
+    return net
+
+
+def debug_print_op(op):
+    print("===============" + op.type() + "==============")
+    print("***inputs:***")
+    for input in op.inputs()["all"]:
+        print input, scope.find_var(input).get_tensor().get_dims()
+    print("\n***outputs:***")
+    for output in op.outputs()["all"]:
+        print output, scope.find_var(output).get_tensor().get_dims()
+    print("")
+    print("")
+
+
+def set_cost(cost):
+    cost_shape = numpy.array(scope.find_var(cost).get_tensor()).shape
+    cost_grad = \
+        scope.find_var(grad_var_name(cost)).get_tensor()
+    cost_grad.set_dims(cost_shape)
+    cost_grad.alloc_float(place)
+    cost_grad.set(numpy.ones(cost_shape).astype("float32"), place)
+
+
+def get_cost_mean(cost):
+    cost_data = numpy.array(scope.find_var(cost).get_tensor())
+    return cost_data.sum() / len(cost_data)
+
+
+def error_rate(predict, label):
+    predict_var = numpy.array(scope.find_var(predict).get_tensor()).argmax(
+        axis=1)
+    label = numpy.array(scope.find_var(label).get_tensor())
+    error_num = numpy.sum(predict_var != label)
+    return error_num / float(len(label))
+
+
+images = data_layer(name='pixel', dims=[BATCH_SIZE, 784])
+labels = data_layer(name='label', dims=[BATCH_SIZE])
+fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid")
+fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid")
+predict = fc_layer(net=forward_net, input=fc2, size=100, act="softmax")
+cost = cross_entropy_layer(net=forward_net, input=predict, label=labels)
+
+init_net.complete_add_op(True)
+forward_net.complete_add_op(True)
+backward_net = create_backward_net(forward_net)
+optimize_net.complete_add_op(True)
+
+print(init_net)
+print(forward_net)
+print(backward_net)
+print(optimize_net)
+
+debug_print_op(forward_net)
+debug_print_op(backward_net)
+debug_print_op(optimize_net)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+
+def test(cost_name):
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    cost = []
+    error = []
+    for data in test_reader():
+        image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
+        label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
+        feed_data(images, image_data)
+        feed_data(labels, label_data)
+
+        forward_net.infer_shape(scope)
+        forward_net.run(scope, dev_ctx)
+        cost.append(get_cost_mean(cost_name))
+        error.append(error_rate(predict, "label"))
+    print("cost=" + str(sum(cost) / float(len(cost))) + " error_rate=" + str(
+        sum(error) / float(len(error))))
+
+
+PASS_NUM = 1
+
+init_net.run(scope, dev_ctx)
+for pass_id in range(PASS_NUM):
+    batch_id = 0
+
+    for data in train_reader():
+        image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
+        label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
+        feed_data(images, image_data)
+        feed_data(labels, label_data)
+
+        forward_net.infer_shape(scope)
+        forward_net.run(scope, dev_ctx)
+        set_cost(cost)
+        backward_net.infer_shape(scope)
+        backward_net.run(scope, dev_ctx)
+
+        optimize_net.run(scope, dev_ctx)
+        if batch_id % 100 == 0:
+            print("pass[" + str(pass_id) + "] batch_id[" + str(batch_id) + "]")
+            test(cost)
+
+        batch_id = batch_id + 1
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index dd65e0f2dc..3bc05a0fec 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -64,7 +64,8 @@ class OpTestMeta(type):
                     actual = numpy.array(scope.find_var(out_name).get_tensor())
                     expect = self.outputs[out_name]
                     self.assertTrue(
-                        numpy.allclose(actual, expect),
+                        numpy.allclose(
+                            actual, expect, atol=1e-05),
                         "output name: " + out_name + "has diff")
 
         obj.test_all = test_all
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index 4815192e25..d4277f2a42 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -8,9 +8,8 @@ class TestCrossEntropy(unittest.TestCase):
     __metaclass__ = OpTestMeta
 
     def setUp(self):
-        # TODO this unit test is not passed
         self.type = "onehot_cross_entropy"
-        batch_size = 100
+        batch_size = 30
         class_num = 10
         X = numpy.random.random((batch_size, class_num)).astype("float32")
         label = 5 * numpy.ones(batch_size).astype("int32")
@@ -22,9 +21,9 @@ class TestCrossEntropy(unittest.TestCase):
 
 
 class CrossEntropyGradOpTest(GradientChecker):
-    def test_softmax_grad(self):
+    def test_check_grad(self):
         op = create_op("onehot_cross_entropy")
-        batch_size = 100
+        batch_size = 30
         class_num = 10
         inputs = {
             "X": numpy.random.uniform(
diff --git a/python/paddle/v2/framework/tests/test_gather_op.py b/python/paddle/v2/framework/tests/test_gather_op.py
new file mode 100644
index 0000000000..e3de3fd0a1
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
@@ -0,0 +1,31 @@
+import unittest
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+import numpy
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+
+
+class TestGatherOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "gather"
+        xnp = numpy.random.random((10, 20)).astype("float32")
+        self.inputs = {
+            'X': xnp,
+            'Index': numpy.array([1, 3, 5]).astype("int32")
+        }
+        self.outputs = {'Out': self.inputs['X'][self.inputs['Index']]}
+
+
+class TestGatherGradOp(GradientChecker):
+    def test_gather_grad(self):
+        op = create_op("gather")
+        xnp = numpy.random.random((10, 20)).astype("float32")
+        inputs = {'X': xnp, 'Index': numpy.array([1, 3, 5]).astype("int32")}
+        self.check_grad(op, inputs, set("X"), "Out")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py
new file mode 100644
index 0000000000..e0b3151208
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_gradient_checker.py
@@ -0,0 +1,43 @@
+import unittest
+import numpy
+from paddle.v2.framework.op import Operator
+from gradient_checker import GradientChecker
+from gradient_checker import get_numeric_gradient
+
+
+class GetNumericGradientTest(unittest.TestCase):  # sanity checks for get_numeric_gradient itself
+    def test_add_op(self):
+        add_op = Operator('add_two', X="X", Y="Y", Out="Z")
+        x = numpy.random.random((10, 1)).astype("float32")
+        y = numpy.random.random((10, 1)).astype("float32")
+
+        arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
+        self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4)  # expects a mean gradient of 1 w.r.t. X
+
+    def test_softmax_op(self):
+        def stable_softmax(x):
+            """Compute the softmax of vector x in a numerically stable way."""
+            shiftx = x - numpy.max(x)
+            exps = numpy.exp(shiftx)
+            return exps / numpy.sum(exps)
+
+        def label_softmax_grad(Y, dY):  # reference gradient, computed row by row
+            dX = Y * 0.0
+            for i in range(Y.shape[0]):
+                d = numpy.dot(Y[i, :], dY[i, :])
+                dX[i, :] = Y[i, :] * (dY[i, :] - d)
+            return dX
+
+        softmax_op = Operator("softmax", X="X", Y="Y")
+
+        X = numpy.random.random((2, 2)).astype("float32")
+        Y = numpy.apply_along_axis(stable_softmax, 1, X)  # softmax applied to each row of X
+        dY = numpy.ones(Y.shape)
+        dX = label_softmax_grad(Y, dY)
+
+        arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X')
+        numpy.testing.assert_almost_equal(arr, dX, decimal=2)  # decimal counts decimal places; 1e-2 allowed an error of ~1.47
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lookup_table.py b/python/paddle/v2/framework/tests/test_lookup_table.py
new file mode 100644
index 0000000000..19eb464baa
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_lookup_table.py
@@ -0,0 +1,31 @@
+import unittest
+import numpy as np
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+
+
+class TestSigmoidOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = 'lookup_table'
+        table = np.random.random((17, 31)).astype('float32')
+        ids = np.random.randint(0, 17, 4).astype('int32')
+        self.inputs = {'W': table, 'Ids': ids}
+        self.outputs = {'Out': table[ids]}
+
+
+class TestSigmoidGradOp(GradientChecker):  # NOTE(review): exercises lookup_table; name looks copied from the sigmoid test
+    def test_grad(self):
+        op = create_op('lookup_table')
+        table = np.random.random((17, 31)).astype('float32')
+        ids = np.random.randint(0, 17, 4).astype('int32')  # four row ids drawn from [0, 17)
+        inputs = {'W': table, 'Ids': ids}
+        # compare gradients
+        self.compare_grad(op, inputs, set(['Ids']))
+        # check gradients
+        self.check_grad(op, inputs, set(['W']), 'Out')  # list form: set('W') would split a longer name into characters
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/framework/tests/test_mean_op.py
index b5d52b9056..f32b3160d6 100644
--- a/python/paddle/v2/framework/tests/test_mean_op.py
+++ b/python/paddle/v2/framework/tests/test_mean_op.py
@@ -1,5 +1,6 @@
 import unittest
 from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
 import numpy as np
 
 
@@ -12,5 +13,12 @@ class TestMeanOp(unittest.TestCase):
         self.outputs = {'Out': np.mean(self.inputs['X'])}
 
 
+class MeanGradOpTest(GradientChecker):  # gradient check for the "mean" operator
+    def test_normal(self):
+        op = create_op("mean")
+        inputs = {"X": np.random.random((10, 10)).astype("float32")}
+        self.check_grad(op, inputs, set(["X"]), "Out")  # list form: set("X") would split a longer name into characters
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_minus_op.py b/python/paddle/v2/framework/tests/test_minus_op.py
new file mode 100644
index 0000000000..5abdd4a69b
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
@@ -0,0 +1,30 @@
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+
+
+class MinusOpTest(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "minus"
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((32, 84)).astype("float32")
+        }
+        self.outputs = {'Out': (self.inputs['X'] - self.inputs['Y'])}
+
+
+class MinusGradTest(GradientChecker):
+    def test_left(self):
+        op = create_op("minus")
+        inputs = {
+            "X": np.random.random((10, 10)).astype("float32"),
+            "Y": np.random.random((10, 10)).astype("float32")
+        }
+        self.check_grad(op, inputs, ["X", 'Y'], "Out")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py
index ec0ac99156..ee0d81a64e 100644
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
@@ -1,6 +1,7 @@
 import unittest
-from op_test_util import OpTestMeta
 import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
 
 
 class TestMulOp(unittest.TestCase):
@@ -15,5 +16,19 @@ class TestMulOp(unittest.TestCase):
         self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
 
 
+class MulGradOpTest(GradientChecker):
+    def test_mul(self):
+        op = create_op("mul")
+        inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        # mul op will enlarge the relative error
+        self.check_grad(
+            op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5)
+
+
+# TODO(dzh,qijun) : mulgrad test case need transpose feature of blas library
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py
index b42cadd11a..9339cf28da 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -6,8 +6,8 @@ import unittest
 def fc(X, W, Y):
     ret_v = core.Net.create()
 
-    ret_v.add_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
-    ret_v.add_op(Operator("sigmoid", X="pre_activation", Y=Y))
+    ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
+    ret_v.append_op(Operator("sigmoid", X="pre_activation", Y=Y))
     ret_v.complete_add_op(True)
     return ret_v
 
@@ -16,12 +16,12 @@ class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
         op1 = Operator("add_two", X="X", Y="Y", Out="Out")
-        net.add_op(op1)
+        net.append_op(op1)
 
         net2 = core.Net.create()
-        net2.add_op(fc(X="X", W="w", Y="fc.out"))
+        net2.append_op(fc(X="X", W="w", Y="fc.out"))
         net2.complete_add_op(True)
-        net.add_op(net2)
+        net.append_op(net2)
         net.complete_add_op(True)
 
         expected = '''
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 3d4a34d8d7..d6000ab9f9 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -150,7 +150,7 @@ class TestRecurrentOp(unittest.TestCase):
         sig_op = Operator("sigmoid", X="sum", Y="h@alias")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
-            stepnet.add_op(op)
+            stepnet.append_op(op)
         stepnet.complete_add_op(True)
         self.rnnop.set_stepnet(stepnet)
 
diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
index f8521eb517..45d569da29 100644
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -1,6 +1,7 @@
 import unittest
-from op_test_util import OpTestMeta
 import numpy as np
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
 
 
 class TestRowwiseAddOp(unittest.TestCase):
@@ -15,5 +16,15 @@ class TestRowwiseAddOp(unittest.TestCase):
         self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
 
 
+class RowwiseAddGradOpTest(GradientChecker):
+    def test_rowwise_add(self):
+        op = create_op("rowwise_add")
+        inputs = {
+            "X": np.random.uniform(0.1, 1, [5, 10]).astype("float32"),
+            "b": np.random.uniform(0.1, 1, [10]).astype("float32")
+        }
+        self.check_grad(op, inputs, set(["X", "b"]), "Out")
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scale_and_identity_op.py b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
new file mode 100644
index 0000000000..69b301c376
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_scale_and_identity_op.py
@@ -0,0 +1,43 @@
+import unittest
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+import numpy as np
+from paddle.v2.framework.op import Operator
+
+
+class IdentityTest(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "identity"
+        self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
+        self.outputs = {'Out': self.inputs['X']}
+
+
+class IdentityGradOpTest(GradientChecker):  # gradient check for the "identity" operator
+    def test_normal(self):
+        op = create_op("identity")
+        inputs = {"X": np.random.random((10, 10)).astype("float32")}
+        self.check_grad(op, inputs, set(["X"]), "Out")  # list form: set("X") would split a longer name into characters
+
+
+class ScaleTest(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "scale"
+        self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
+        self.attrs = {'scale': -2.3}
+        self.outputs = {'Out': self.inputs['X'] * self.attrs['scale']}
+
+
+class ScaleGradTest(GradientChecker):  # gradient check for the "scale" operator
+    def test_normal(self):
+        op = Operator("scale", X="X", Out="Out", scale=3.2)
+        self.check_grad(op,
+                        {"X": np.random.random((10, 10)).astype("float32")},
+                        set(["X"]), "Out")  # list form: set("X") would split a longer name into characters
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scatter_op.py b/python/paddle/v2/framework/tests/test_scatter_op.py
new file mode 100644
index 0000000000..c1f9444889
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
@@ -0,0 +1,38 @@
+import unittest
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+import numpy
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+
+
+class TestScatterOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "scatter"
+        ref_np = numpy.ones((3, 3)).astype("float32")
+        index_np = numpy.array([1, 2]).astype("int32")
+        updates_np = numpy.random.random((2, 3)).astype("float32")
+        output_np = numpy.copy(ref_np)
+        output_np[index_np] += updates_np
+        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.outputs = {'Out': output_np}
+
+
+class TestScatterGradOp(GradientChecker):
+    def test_scatter_grad(self):
+        op = create_op("scatter")
+        # test data setup
+        ref_np = numpy.ones((3, 10)).astype("float32")
+        index_np = numpy.array([1, 2]).astype("int32")
+        updates_np = numpy.random.random((2, 10)).astype("float32")
+        output_np = numpy.copy(ref_np)
+        output_np[index_np] += updates_np
+        inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.check_grad(
+            op, inputs, set(["Updates", "Ref"]), "Out", in_place=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py
index 2a57a41ed8..273c2e5ab1 100644
--- a/python/paddle/v2/framework/tests/test_sigmoid_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py
@@ -1,6 +1,7 @@
 import unittest
-from op_test_util import OpTestMeta
 import numpy as np
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
 
 
 class TestSigmoidOp(unittest.TestCase):
@@ -8,12 +9,20 @@ class TestSigmoidOp(unittest.TestCase):
 
     def setUp(self):
         self.type = "sigmoid"
-        self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+        self.inputs = {'X': np.random.random((15, 31)).astype("float32")}
         self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))}
 
 
-#class TestSigmoidGradOp(unittest.TestCase):
-#TODO(qingqing) add unit test
+class TestSigmoidGradOp(GradientChecker):  # gradient checks for the "sigmoid" operator
+    def test_grad(self):
+        op = create_op("sigmoid")
+        inputs = {"X": np.random.uniform(0.1, 1, [11, 17]).astype("float32")}
+        # compare gpu and cpu results for backward op.
+        # this test will be skipped if only compiling CPU version.
+        self.compare_grad(op, inputs)
+        # check gradients
+        self.check_grad(op, inputs, set(["X"]), "Y", max_relative_error=0.007)  # list form: set("X") would split a longer name
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 9c4dd5f250..0654a30104 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -27,16 +27,24 @@ class SGD(object):
     SGD Trainer combines data reader, network topolopy and update_equation together
     to train/test a neural network.
 
-    :param update_equation: The optimizer object.
-    :type update_equation: paddle.v2.optimizer.Optimizer
     :param cost: Target cost that neural network should be optimized.
     :type cost: paddle.v2.config_base.Layer
     :param parameters: The parameters dictionary.
     :type parameters: paddle.v2.parameters.Parameters
+    :param update_equation: The optimizer object.
+    :type update_equation: paddle.v2.optimizer.Optimizer
     :param extra_layers: Some layers in the neural network graph are not
                          in the path of cost layer.
-    :param pserver_spec: pserver location, eg: localhost:3000
     :type extra_layers: paddle.v2.config_base.Layer
+    :param is_local: Whether to train locally.
+    :type is_local: bool
+    :param pserver_spec: Comma-separated list of pserver addresses,
+                         e.g. "127.10.0.10:3000,127.10.0.11:3000".
+                         This parameter is only used for fault
+                         tolerant mode cluster training.
+    :type pserver_spec: string
+    :param use_etcd: Whether to use etcd pserver.
+    :type use_etcd: bool
     """
 
     def __init__(self,
diff --git a/python/setup.py.in b/python/setup.py.in
index 287442e013..87b3823e52 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -24,14 +24,17 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=["opencv-python"]
 
 # the prefix is sys.prefix which should always be usr
-paddle_bin_dir = 'local/opt/paddle/bin'
+paddle_bin_dir = 'opt/paddle/bin'
 paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage',
                '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
                '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
-               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main']
+               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
+               '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 
-paddle_rt_lib_dir = 'local/lib'
-paddle_rt_libs = [] if '${MKL_SHARED_LIBS}'== '' else '${MKL_SHARED_LIBS}'.split(';')
+paddle_rt_lib_dir = 'lib'
+paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
+if '${MKL_SHARED_LIBS}'!= '':
+  paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';')
 
 setup(name='paddlepaddle',
       version='${PADDLE_VERSION}',
@@ -50,8 +53,7 @@ setup(name='paddlepaddle',
           'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
-      scripts=['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'],
+      scripts=paddle_bins,
       distclass=BinaryDistribution,
-      data_files=[(paddle_bin_dir, paddle_bins),
-                  (paddle_rt_lib_dir, paddle_rt_libs)]
+      data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )