diff --git a/.gitignore b/.gitignore
index 020d3f0c30..ac56a3320e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
+python/paddle/version.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6aeef23330..b309ff37e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,8 @@ SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 include(system)
 
 project(paddle CXX C Go)
+message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
+message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
 
 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
@@ -58,6 +60,7 @@ option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
diff --git a/README.md b/README.md
index db0fbd88b2..ceeb6d9e51 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -36,7 +36,7 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
     examples:
 
       - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+      (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
       - Highly optimized recurrent networks which can handle **variable-length**
       sequence without padding.
       - Optimized local and distributed training for models with high dimensional
diff --git a/RELEASE.cn.md b/RELEASE.cn.md
index 5deaf230a8..494c59730d 100644
--- a/RELEASE.cn.md
+++ b/RELEASE.cn.md
@@ -1,3 +1,62 @@
+# v0.11.0版本
+
+## PaddlePaddle Fluid
+
+- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中，不再有*模型*这个概念，应用也不再包含一个用于描述Operator图或者一系列层的符号描述，而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流，例如 if-else-then或者for，而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如：
+
+  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
+
+- 在v0.11.0版本中，我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中，我们将提升和优化Executor成为一个调试器，就像GDB。并可能提供一些编译器，这个编译器会读取一个上文所描述的应用然后编译成一个等价的
+源代码，这个源代码可以被nvcc编译成可以使用CUDA的二进制，或者被icc编译成可以充分利用Intel CPU的二进制。
+
+
+## 新特点
+
+* 发布 `PaddlePaddle Fluid`。
+* 增加了用于模型预测的C-API。
+* 用Fluid API实现了一个简单的GAN的例子。
+* 增加了关于性能调优的文档。
+* 为`paddle.v2.dataset`下载数据集提供了重试机制.
+* C++中使用protobuf-lite替换protobuf减少了二进制的大小。
+* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment).
+* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。
+* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。
+  - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet
+  - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。
+* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign)
+* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod)
+* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance)
+* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq)
+* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score)
+* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice)
+* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
+* 增加移动端友好的网页
+
+## 改进
+
+* 使用一个Python`whl`包即可安装.
+* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。
+* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。
+* 删除了有一些bug的BarrierStat。
+* 清理和删除了paddle::Parameter中未使用的函数。
+* 删除了ProtoDataProvider。
+* Huber loss同时支持回归和分类。
+* 为sequence pooling 层增加`stride`参数。
+* v2 API自动使用cudnn batch normalization。
+* 可以使用一个固定的参数名共享BN层的参数。
+* 2D convolution operation支持variable-dimension input特性。
+* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。
+* 优化网页导航。
+
+## 错误修复
+
+* 修复ROI pooling的Bug. cc9a761
+* 修复当label是dense vector是AUC变成0的问题. #5274
+* 修复WarpCTC 层的Bug.
+
+
 # v0.10.0版本
 
 我们非常高兴发布了PaddlePaddle V0.10.0版，并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
diff --git a/RELEASE.md b/RELEASE.md
index 146f7afa7d..5a62c95513 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,75 @@
+# Release v0.11.0
+
+## PaddlePaddle Fluid
+
+- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*.  Fluid is
+  designed to allow users to program like PyTorch and TensorFlow Eager Execution.
+  In these systems, there is no longer the concept *model* and applications
+  do not include a symbolic description of a graph of operators nor a sequence
+  of layers. Instead, applications look exactly like a usual program that
+  describes a process of training or inference.  The difference between
+  Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's
+  control-flow, `if-then-else` nor `for`.  Instead, Fluid provides its
+  C++ implementations and their Python binding using the `with` statement.  For an example
+
+  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
+
+- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program.
+Executor works like an interpreter. In future version, we will improve
+`Executor` into a debugger like GDB, and we might provide some compilers,
+which, for example, takes an application like the above one, and outputs
+an equivalent C++ source program, which can be compiled using
+[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html)
+to generate binaries that use CUDA, or using
+[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries
+that make full use of Intel CPUs.
+
+## New Features
+
+* Release `PaddlePaddle Fluid`.
+* Add C-API for model inference
+* Use fluid API to create a simple GAN demo.
+* Add develop guide about performance tunning.
+* Add retry when download `paddle.v2.dataset`.
+* Linking protobuf-lite not protobuf in C++. Reduce the binary size.
+* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released.
+* A new style cmake functions for Paddle. It is based on Bazel API.
+* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN.
+  - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet
+  - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML.
+* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign).
+* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod).
+* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance).
+* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq).
+* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score).
+* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice).
+* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
+* Add mobile friendly webpages.
+
+## Improvements
+
+* Build and install using a single `whl` package.
+* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标).
+* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices.
+* Remove buggy BarrierStat.
+* Clean and remove unused functions in paddle::Parameter.
+* Remove ProtoDataProvider.
+* Huber loss supports both regression and classification.
+* Add the `stride` parameter  for sequence pooling layers.
+* Enable v2 API use cudnn batch normalization automatically.
+* The BN layer's parameter can be shared by a fixed the parameter name.
+* Support variable-dimension input feature for 2D convolution operation.
+* Refine cmake about CUDA to automatically detect GPU architecture.
+* Improved website navigation.
+
+## Bug Fixes
+
+* Fix bug in ROI pooling. cc9a761
+* Fix AUC is zero when label is dense vector. #5274
+* Fix bug in WarpCTC layer.
+
 # Release v0.10.0
 
 We are glad to release version 0.10.0.  In this version, we are happy to release the new 
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 16c2390fd3..8ee7fd28c5 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -2,27 +2,25 @@
 
 Machine:
 
-- Server
- 	- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
-- Laptop
- 	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
- 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
-- Desktop
- 	- i7-6700k
+- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop: TBD
 
 System: CentOS release 6.3 (Final), Docker 1.12.1.
 
-PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
-- MKL-DNN tag v0.11
-- MKLML 2018.0.1.20171007
-- OpenBLAS v0.2.20
-(TODO: will rerun after 0.11.0)
+PaddlePaddle: (TODO: will rerun after 0.11.0)
+- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
+  - MKL-DNN tag v0.11
+  - MKLML 2018.0.1.20171007
+- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+  - OpenBLAS v0.2.20
 	 
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
 
 ## Benchmark Model
 
 ### Server
+
+#### Training
 Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 
 Input image size - 3 * 224 * 224, Time: images/second
@@ -35,9 +33,7 @@ Input image size - 3 * 224 * 224, Time: images/second
 | MKLML        | 12.12 | 13.70 | 16.18  |
 | MKL-DNN      | 28.46 | 29.83 | 30.44  |
 
-
-chart on batch size 128
-TBD
+<img src="figs/vgg-cpu-train.png" width="500">
 
  - ResNet-50
 
@@ -47,9 +43,7 @@ TBD
 | MKLML        | 32.52 | 31.89 | 33.12  |
 | MKL-DNN      | 81.69 | 82.35 | 84.08  |
 
-
-chart on batch size 128
-TBD
+<img src="figs/resnet-cpu-train.png" width="500">
 
  - GoogLeNet
 
@@ -59,10 +53,35 @@ TBD
 | MKLML        | 128.46| 137.89| 158.63 |
 | MKL-DNN      | 250.46| 264.83| 269.50 |
 
-chart on batch size 128
-TBD
+<img src="figs/googlenet-cpu-train.png" width="500">
+
+#### Inference
+Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+- VGG-19
+
+| BatchSize | 1     | 2     | 4     | 8     | 16    |
+|-----------|-------|-------|-------|-------|-------|
+| OpenBLAS  | 1.07  | 1.08  | 1.06  | 0.88  | 0.65  |
+| MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
+| MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+- ResNet-50
+
+| BatchSize | 1     | 2      | 4      | 8      | 16     |
+|-----------|-------|--------|--------|--------|--------|
+| OpenBLAS  | 3.35  | 3.19   | 3.09   | 2.55   | 1.96   |
+| MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
+| MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
+
+- GoogLeNet
+
+| BatchSize | 1      | 2      | 4      | 8      | 16     |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS  | 12.04  | 11.31  | 10.00  | 9.07   | 4.34   |
+| MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
+| MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
+
 
 ### Laptop
 TBD
-### Desktop
-TBD
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
new file mode 100644
index 0000000000..c3f67faf09
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
new file mode 100644
index 0000000000..b96ecd5ff9
Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
new file mode 100644
index 0000000000..f830ca6a87
Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
similarity index 100%
rename from benchmark/paddle/image/run_mkldnn_infer.sh
rename to benchmark/paddle/image/run_mkl_infer.sh
diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkl_train.sh
similarity index 85%
rename from benchmark/paddle/image/run_mkldnn_train.sh
rename to benchmark/paddle/image/run_mkl_train.sh
index 320206239a..5335af5ac1 100755
--- a/benchmark/paddle/image/run_mkldnn_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -28,6 +28,10 @@ function train() {
     --test_period=100 \
     --config_args=$args \
     2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
 
 if [ ! -f "train.list" ]; then
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
new file mode 100755
index 0000000000..c1001d3a7c
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -0,0 +1,62 @@
+set -e
+
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  if [ $thread -gt $bs ]; then
+    thread=$bs
+  fi
+  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "./run_mkl_infer.sh to save the model first"
+    exit 0
+  fi
+  log_period=$((256 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # calculate the last 5 logs period time of 1280 samples,
+  # the time before are burning time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+  infer googlenet v1 $batchsize
+  infer resnet 50 $batchsize
+  infer vgg 19 $batchsize
+done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
new file mode 100755
index 0000000000..b9494ce119
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -0,0 +1,39 @@
+set -e
+
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  # each trainer_count use only 1 core to avoid conflict
+  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+  train vgg 19 $batchsize
+  train resnet 50 $batchsize
+  train googlenet v1 $batchsize
+done
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index b21fc43904..6320b17520 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -3,7 +3,7 @@
 # It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
@@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
   set(CBLAS_INC_DIR ${MKLML_INC_DIR})
   set(CBLAS_LIBRARIES ${MKLML_LIB})
 
-  add_definitions(-DPADDLE_USE_MKLML)
+  add_definitions(-DPADDLE_WITH_MKLML)
   add_definitions(-DLAPACK_FOUND)
 
   message(STATUS "Found cblas and lapack in MKLML "
@@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
   return()
 endif()
 
-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
-        ${ATLAS_ROOT}/include
-        /usr/include
-        /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
-        ${ATLAS_ROOT}/lib
-        /usr/lib
-        /usr/lib/blas/atlas
-        /usr/lib/atlas
-        /usr/lib/atlas-base   # special for ubuntu 14.04.
-    )
-find_path(ATLAS_INC_DIR NAMES cblas.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
-
-  add_definitions(-DPADDLE_USE_ATLAS)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
-  return()
-endif()
-
 ## Then find openblas.
 set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e550ec2856..5c6bcfde76 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index fc52d339d7..5d24caebdc 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
 MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_USE_MKLDNN)
+add_definitions(-DPADDLE_WITH_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 4c4f59656d..97857a686b 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -114,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-    ADD_LIBRARY(cblas SHARED ${dummyfile})
-ELSE()
-    ADD_LIBRARY(cblas STATIC ${dummyfile})
-ENDIF()
+ADD_LIBRARY(cblas STATIC ${dummyfile})
 TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 
 IF(NOT ${CBLAS_FOUND})
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
index 9be0b370ee..84f9097a6c 100644
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -7,3 +7,4 @@ API
     模型配置 <v2/model_configs.rst>
     数据访问 <v2/data.rst>
     训练与应用 <v2/run_logic.rst>
+    v2/fluid.rst
diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst
index eca3ce03bc..5317e66b64 100644
--- a/doc/api/v2/config/activation.rst
+++ b/doc/api/v2/config/activation.rst
@@ -99,3 +99,10 @@ STanh
 ..  automodule:: paddle.v2.activation
     :members: STanh
     :noindex:
+    
+SoftSign
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftSign
+    :noindex:
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 89e5fec13b..9f3669e115 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -300,3 +300,7 @@ conv2d_transpose
 ..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
     :noindex:
 
+sequence_expand
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
diff --git a/doc/design/executor.md b/doc/design/executor.md
index b5fb6c5c3c..aa738ab598 100644
--- a/doc/design/executor.md
+++ b/doc/design/executor.md
@@ -1,23 +1,27 @@
 # Executor Design Doc
 
 ## Motivation
+In the [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage user use deep learning programming paradigms to describe training process. When the user-written Python program is executed, it will create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
 
-We use executor to do the runtime evaluation of a `ProgramDesc`.
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains intrinsics/operators and variables which will be used, executor explicitly execute the stored precompiled code.
 
 ## Overview
 
 An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs.
 
-### What does executor do?
+## Executor
 
-It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+`Executor` explicitly executes all the intrinsics/operators in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence. It is very similar to push stack frame when entering the block, it will destroy the temporary variables when mini-batch is finished, but it does not have stack frame pop process.
 
-### What does executor NOT do?
+### Interface
+```c++
+  Executor(places);
+```
+A executor does not own any computing resources, user can only construct an executor with specified places.
 
-It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run.
 
-It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
-
-## Implementation
-
-`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+A executor only provides an unified way to execute `ProgramDesc`. `ProgramDesc` is the target will be executed, scope specifies the variable container. `block_id` indicates the entrance block, `create_local_scope` means if it will destroy the temporary variables after execution finished.
diff --git a/doc/design/fluid-compiler.graffle b/doc/design/fluid-compiler.graffle
new file mode 100644
index 0000000000..c933df2cb8
Binary files /dev/null and b/doc/design/fluid-compiler.graffle differ
diff --git a/doc/design/fluid-compiler.png b/doc/design/fluid-compiler.png
new file mode 100644
index 0000000000..1b0ffed203
Binary files /dev/null and b/doc/design/fluid-compiler.png differ
diff --git a/doc/design/fluid.md b/doc/design/fluid.md
new file mode 100644
index 0000000000..585dc8ef39
--- /dev/null
+++ b/doc/design/fluid.md
@@ -0,0 +1,122 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model.  In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no  concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+
+This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general,  prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as following:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is the moving the model configuration part (the first step) into the training loop.  This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block.  This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange x.len():
+        h[t] = the_step(x[t])
+```        
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```    
+
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loop, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there  is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete.  Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explain in more detail later in this article.
+
+## Backward Compatibility of Fluid
+
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
+
+## Towards a Deep Learning Language and the Compiler
+
+We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
+
+Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler, which translates each invocation to an operator, into a C++ call to a kernel function of that operator. For example, a transpiler that weaves the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`.  Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel.  More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server.  For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer.  The following figure explains the proposed two-stage process:
+
+![](fluid-compiler.png)
diff --git a/doc/design/images/multigpu_allreduce.graffle b/doc/design/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000..cb5bc420ce
Binary files /dev/null and b/doc/design/images/multigpu_allreduce.graffle differ
diff --git a/doc/design/images/multigpu_allreduce.png b/doc/design/images/multigpu_allreduce.png
new file mode 100644
index 0000000000..87a1b3e8f6
Binary files /dev/null and b/doc/design/images/multigpu_allreduce.png differ
diff --git a/doc/design/images/multigpu_before_convert.graffle b/doc/design/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000..6c35ab1b21
Binary files /dev/null and b/doc/design/images/multigpu_before_convert.graffle differ
diff --git a/doc/design/images/multigpu_before_convert.png b/doc/design/images/multigpu_before_convert.png
new file mode 100644
index 0000000000..9c8f771116
Binary files /dev/null and b/doc/design/images/multigpu_before_convert.png differ
diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkl/image/engine.png
similarity index 100%
rename from doc/design/mkldnn/image/engine.png
rename to doc/design/mkl/image/engine.png
diff --git a/doc/design/mkldnn/image/gradients.png b/doc/design/mkl/image/gradients.png
similarity index 100%
rename from doc/design/mkldnn/image/gradients.png
rename to doc/design/mkl/image/gradients.png
diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkl/image/layers.png
similarity index 100%
rename from doc/design/mkldnn/image/layers.png
rename to doc/design/mkl/image/layers.png
diff --git a/doc/design/mkldnn/image/matrix.png b/doc/design/mkl/image/matrix.png
similarity index 100%
rename from doc/design/mkldnn/image/matrix.png
rename to doc/design/mkl/image/matrix.png
diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkl/image/overview.png
similarity index 100%
rename from doc/design/mkldnn/image/overview.png
rename to doc/design/mkl/image/overview.png
diff --git a/doc/design/mkl/mkl_packed.md b/doc/design/mkl/mkl_packed.md
new file mode 100644
index 0000000000..0123315ad4
--- /dev/null
+++ b/doc/design/mkl/mkl_packed.md
@@ -0,0 +1,108 @@
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+
+
+## Contents
+
+- [Overview](#overview)
+- [Key Points](#key-points) 
+   - [Background](#background)
+   - [Solution](#solution)
+- [Actions](#actions)
+    - [CMake](#cmake)
+	- [Layers](#layers)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+
+
+## Overview
+我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中，充分发挥英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+现阶段的优化主要针对 Recurrent Neural Network（以下简称RNN）相关层（包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`）， 以及 PaddlePaddle V1 API。
+
+## Key Points
+
+### Background
+目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数，这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。
+
+1. 转换耗时 \
+这一数据格式的转换操作（Packing），在问题本身的计算量比较小的时候，显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中，矩阵大小是`batch_size * 2048`。
+2. 转换冗余 \
+由于在现有的某些情况下（例如RNN），多次调用 cblas_?gemm 会使用相同的原数据，因此，每次调用时对原数据的重复Packing便成为了冗余。
+
+为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时，Intel® MKL 引入了以下四个API:
+   * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+   * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+   * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+   * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)
+
+通过使用这些API，我们可以先完成对原数据的Packing操作，再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数，从而避免了Packing冗余。
+
+### Solution
+在RNN的情况下，同一次前向、后向（forward/backward）过程中所有时间步（time step）共享同一个权重（weight）。当只做推断（inference）时，各次前向之间也都使用了相同的权重，没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。
+
+我们通过使用新引入的GEMM Packed APIs，在层初始化的时候，先完成对权重的Packing操作，然后在前向，后向时复用已经转换过的权重，并在每次权重更新后，对新的权重进行转换用于下次迭代。
+
+* 优化前，对于序列长度（sequence length）为`T`的网络模型（model）, `N`次迭代执行的转换次数为：
+  - `inference`： `N * T`  
+  - `training`： `2 * N * T`
+* 优化后，对于同样设置的网络模型，其转换次数减少至：
+  - `inference`： `1`    
+  - `training`： `2 * N`
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        |   ├── MKLPackedGatedRecurrentLayer.*
+        |   ├── MKLPackedLstmLayer.*
+        |   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+
+### CMake
+在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开，来决定是否开启MKL Packed相关功能。
+
+### Layers
+所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`，该文件对相关GEMM Packed APIs做了封装。
+
+### Unit Tests
+我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。
+对于每一个新加的RNN layer，我们会对比如下2个方面：
+1. 对比优化后layer自身，sequence mode（`rnn_use_batch=false`）与batch mode(`rnn_use_batch=true`)的结果。
+2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。
+
+### Python API
+计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag，用于选择是否使用相关功能，并且当编译时`WITH_MKL=ON`的情况下，默认设置为`true`。
+
+同时，在`python/paddle/trainer/config_parser.py`中对应的layer处，添加`use_mkl_packed`这个选择，方便用户在Python端选择是否启用这个功能。
+
+具体实现方式比如：
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+所有相关的`layer_type`会以*mkl_packed_*开头，这些会在`MKLPacked*Layer`注册layer的时候保证，以示区分。 
+
+
+### Benchmarking
+会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。
+
+## References 
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
+
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkl/mkldnn.md
similarity index 99%
rename from doc/design/mkldnn/README.MD
rename to doc/design/mkl/mkldnn.md
index 61d453de24..e2fe1e6b26 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkl/mkldnn.md
@@ -208,4 +208,3 @@ if use_mkldnn
 但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
 4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
 所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
-
diff --git a/doc/design/paddle_nccl.md b/doc/design/paddle_nccl.md
new file mode 100644
index 0000000000..c7dac70998
--- /dev/null
+++ b/doc/design/paddle_nccl.md
@@ -0,0 +1,65 @@
+# Design Doc: NCCL support in Paddle Fluid
+
+## Abstract
+
+This Design Doc refers to the NCCL feature in  paddle.  We propose an approach to support NCCL library both on a single machine and multiple machines. We wrapper the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script.
+
+
+## Motivation
+
+[NCCL](https://developer.nvidia.com/nccl) is a NVIDIA library support Multi-GPU communicating and optimized for NVIDIA GPUs, it provides routines such as all-gather, all-reduce, broadcast, reduce, reduce-scatter, that can achieve high bandwidth over PCIe and NVLink high-speed interconnect. With NCCL library, we can easily accelerate the training in parallel. 
+
+- Pros
+1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library.
+1. high performance in NVIDIA GPUs.
+1. MPI like primitives, which have low learning cost for users.
+
+- Cons
+1. Only design for NVIDIA GPUs, not a general multi-device solution.
+1. Although NCCL1 is opensourced under BSD license, but NCCL2 is not opensourced anymore.
+
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients at any time user interests.
+
+As a result, during training, we need the operations of peer to peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only need to run the operator with correct place information.
+
+Besides, it needs interfaces to synchronize model update with each different GPU Cards. 
+
+## Implementation
+
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Need to note that NCCL need to create Communicator between gpu at the beginning, so there is a NCCLInit operator created.
+
+### Transpiler
+
+To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices.
+
+1. The user-defined model will be a single device program
+
+2. Broadcast/Reduce operators between GPUs will be inserted into the program, even for the multi-node, may insert the `Send`, `Recv` operator.
+
+   *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+
+   <img src="images/multigpu_before_convert.png" width="300"/>
+
+After compiling, the graph as shows
+
+<img src="images/multigpu_allreduce.png" width="1000"/>
+
+Operators are added to the sub-graphs. Every GPU assigned a role of `rank0`, `rank1` etc. 
+
+- **Broadcast**. Broadcast operator distribute initialized parameter to all the GPUs from the GPU who owns it. e.g. from`rank0` GPU.
+- **AllReduce**. AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce implemented in the Ring-Based  communicating method, avoid of the bottle neck in a single GPU.
+
+Need to notice that AllReduce operator force GPUs synchronized at that point. The whole training process in asynchronous or synchronous mode depends on the AllReduce point in the graph.
+
+As it shown in the picture, when each GPU compute the gradient of `W`, followed with a `AllReduce` operator, accumulate the `dW` to full batch of data, then run the optimize process individually and apply the gradient to its `W`.
+
+- **AllReduce**
+  Need to note that our AllReduce operator is a ring-base AllReduce implementation. If we use the NCCL2 AllReduce primitive, every GPU optimized full batch of data, wasted (n-1) GPU compute resources. In addition, NCCL2 built-in AllReduce will only utilize the communicating resource during synchronization, then update the gradient will be a subsequent phase. In fact, we can amortize the update gradient time cost into the communicating phase. The process is
+1. Every parameter has its root card. That card will responsible for aggregating the gradients from GPUs.
+2. The whole model's parameter will be hashed to different root card, ensure the load balance between GPUs.
+3. Logically neighberhood card will start send parameter to the next one. After one round, the parameter main card will aggregate the full gradients.
+4. Then the root card will optimize the parameter.
+5. This parameter card will send its optimized result to its neighberhood, then the neighberhood will send parameter to its next one.
+6. Finish the sychronization round.
+
+The total time cost will be 2 * (n-1) * per-parameter-send-time, we reach the goal of amortize the upgrade time into communicating phase.
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 62ff8f3229..14c081ea84 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Vers
 PaddlePaddle每次发新的版本，遵循以下流程:
 
 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
-2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
-3. 对这个版本的提交，做如下几个操作:
+1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+1. 对这个版本的提交，做如下几个操作:
+	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
 	* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
 	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
 	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
@@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 			pip install twine
 			twine upload dist/[package to upload]
 			```
-4. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
-6. 协同完成Release Note的书写
+1. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+1. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
+1. 协同完成Release Note的书写
 
 
 需要注意的是:
@@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 * `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
 * 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
 
-# PaddlePaddle 分支规范
+## PaddlePaddle 分支规范
 
 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
 
@@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 * BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
 
-# PaddlePaddle回归测试列表
+## PaddlePaddle回归测试列表
 
 本列表说明PaddlePaddle发版之前需要测试的功能点。
 
-## PaddlePaddle Book中所有章节
+### PaddlePaddle Book中所有章节
 
 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
 
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
new file mode 100644
index 0000000000..fd23dc211a
--- /dev/null
+++ b/doc/design/support_new_device.md
@@ -0,0 +1,248 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicates the device id and manages hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+
+```
+        |   CPUPlace   --> MKLDNNPlace
+Place --|   CUDAPlace  --> CUDNNPlace
+        |   FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
+```
+
+#### DeviceContext
+
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardwares, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+                /->  CPUDeviceContext   --> MKLDeviceContext
+DeviceContext ---->  CUDADeviceContext  --> CUDNNDeviceContext
+                \->  FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+  virtual Place GetPlace() const = 0;
+};  
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+  Place GetPlace() const override { return place_; }
+private:
+  CUDAPlace place_;
+  cudaStream_t stream_; 
+  cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+
+- CUDNNDeviceContext
+
+```
+class CUDNNDeviceContext : public CUDADeviceContext {
+  private:
+    cudnnHandle_t cudnn_handle_;
+};
+```
+
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implementing these interfaces, we have to implement MemoryAllocator for different Devices
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in header file.
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+```
+
+CPU implemention is in .cc file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+  public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+
+CUDA implemention is in .cu file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};                  
+```
+
+
+We get computing handle from a concrete DeviceContext, and make compution on tensors.
+
+The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+
+Fluid provides different register interfaces in op_registry.h
+
+
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
+
+
+We will discuss how to implement an efficient OpKernel switch policy. 
+
+- TBD
diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst
index f1677e216f..a2bdeead78 100644
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -14,7 +14,7 @@
 
     $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
 更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
 
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index 3c525bdad6..c875c807b8 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -19,7 +19,7 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
    git clone https://github.com/PaddlePaddle/Paddle.git
    cd Paddle
    # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
    # 如果不使用Docker编译环境，执行下面的命令
    mkdir build
    cd build
@@ -30,7 +30,7 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
 
 .. code-block:: bash
 
-   pip install python/dist/*.whl
+   pip install build/python/dist/*.whl
 
 
 .. _run_test:
@@ -45,7 +45,7 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
 
 如果不使用Docker，可以执行ctest命令即可：
 
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index 76fbc43de2..f194f84ce7 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -21,7 +21,7 @@ Then run:
    git clone https://github.com/PaddlePaddle/Paddle.git
    cd Paddle
    # run the following command to build a CPU-Only binaries if you are using docker
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
    # else run these commands
    mkdir build
    cd build
@@ -34,7 +34,7 @@ machine or copy it to the target machine.
 
 .. code-block:: bash
 
-   pip install python/dist/*.whl
+   pip install build/python/dist/*.whl
 
 
 .. _run_test:
@@ -49,7 +49,7 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
 
 If you don't use Docker, just run ctest will start the tests:
 
@@ -117,7 +117,7 @@ You can add :code:`-D` argument to pass such options, like:
     "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
     "WITH_STYLE_CHECK", "Check code style when building", "ON"
     "WITH_TESTING", "Build unit tests", "ON"
-    "WITH_DOC", "Build documentaions", "OFF"
+    "WITH_DOC", "Build documentations", "OFF"
     "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
     "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
     "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index f78b1fb0e1..1eb06e4182 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -114,7 +114,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
   .. code-block:: bash
 
-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
 **注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
 
@@ -122,7 +122,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
 **关于AVX：**
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index d7acc7aeb7..5a46c598f2 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -122,7 +122,7 @@ GPU driver installed before move on.
 
   .. code-block:: bash
 
-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
 **NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
 
@@ -130,7 +130,7 @@ GPU driver installed before move on.
 
      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
 **About AVX:**
 
diff --git a/doc/getstarted/concepts/src/infer.py b/doc/getstarted/concepts/src/infer.py
new file mode 100644
index 0000000000..4cc58dfee0
--- /dev/null
+++ b/doc/getstarted/concepts/src/infer.py
@@ -0,0 +1,18 @@
+import paddle.v2 as paddle
+import numpy as np
+
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+# loading the model which generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+# Input multiple sets of data，Output the infer result in a array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py
index 8aceb23406..4bccbfca3c 100644
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
@@ -26,6 +26,11 @@ def event_handler(event):
         if event.batch_id % 1 == 0:
             print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
                                                   event.cost)
+    # product model every 10 pass
+    if isinstance(event, paddle.event.EndPass):
+        if event.pass_id % 10 == 0:
+            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)
 
 
 # define training dataset reader
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
index c243083794..e695ff283e 100644
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -147,4 +147,9 @@ PaddlePaddle支持不同类型的输入数据，主要包括四种类型，和
 ..  literalinclude:: src/train.py
     :linenos:
 
+使用以上训练好的模型进行预测，取其中一个模型params_pass_90.tar，输入需要预测的向量组，然后打印输出：
+
+..  literalinclude:: src/infer.py
+    :linenos:
+
 有关线性回归的实际应用，可以参考PaddlePaddle book的 `第一章节 <http://book.paddlepaddle.org/index.html>`_。
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index 6993901452..3e0bf7b397 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a")
 
 ## 构建和测试
 
-编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:dev`来代替。
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:latest-dev`来代替。
 
 如要build这个开发镜像，在源码目录树的根目录中运行：
 
 ```bash
-➜  docker build -t paddle:dev .
+➜  docker build -t paddle:latest-dev .
 ```
 
 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
 
 ```bash
-➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
 ```
 
 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
@@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 如果要运行所有的单元测试，可以用如下命令：
 
 ```bash
-➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
 
 关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index 6cfc9536f2..757a5840bc 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -1,17 +1,18 @@
 # 如何写新的Operator
 
  - [概念简介](#概念简介)
- - [实现C++类](#实现C++类)
-   - [定义ProtoMaker类](#定义ProtoMaker类)
-   - [定义Operator类](#定义Operator类)
-   - [定义OpKernel类](#定义OpKernel类)
-   - [注册Operator](#注册Operator)
+ - [实现C++类](#实现c类)
+   - [定义ProtoMaker类](#定义protomaker类)
+   - [定义Operator类](#定义operator类)
+   - [定义OpKernel类](#定义opkernel类)
+   - [注册Operator](#注册operator)
    - [编译](#编译)
- - [绑定Python](#绑定Python)
+ - [绑定Python](#绑定python)
  - [实现单元测试](#实现单元测试)
-   - [前向Operator单测](#前向Operator单测)
-   - [反向Operator单测](#反向Operator单测)
+   - [前向Operator单测](#前向operator单测)
+   - [反向Operator单测](#反向operator单测)
    - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
 
 
 ## 概念简介
@@ -30,8 +31,8 @@
 --------------  | :----------------------
 OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
 Op定义           | `.cc`文件
-Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，GPU 实现在`.cu`文件中。
-注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，GPU实现在`.cu`文件中
+Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
+注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
 
 
 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
@@ -43,7 +44,7 @@ Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU
 ## 实现C++类
 
 
-### 1. 定义ProtoMaker类
+### 定义ProtoMaker类
 
 矩阵乘法的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。
 
@@ -100,7 +101,7 @@ The equation is: Out = scale*X
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
 
 
-### 2. 定义Operator类
+### 定义Operator类
 
 下面的点实现了MulOp的定义：
 
@@ -149,11 +150,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和下面将要介绍的注册函数一起放在`.cc`中
 
-### 3. 定义OpKernel类
+### 定义OpKernel类
 
 `MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
 
-- `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
 
 - `typename T` : 表示数据类型，如`float`, `double`等。
 
@@ -165,7 +166,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 下面是 `MulKernel` `Compute`的实现：
 
   ```cpp
-  template <typename Place, typename T>
+  template <typename DeviceContext, typename T>
   class MulKernel : public framework::OpKernel {
   public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -173,33 +174,32 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
     auto* Y = context.Input<Tensor>("Y");
     auto* Z = context.Output<Tensor>("Out");
     Z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
   }
   };
   ```
 
-需要注意：**不同设备(CPU、GPU)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
+需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
 
-`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
 
-为了使`OpKernel`的计算过程书写更加简单，并且CPU、GPU的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
 
 
 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
 
-### 4. 注册Operator
+### 注册Operator
 
 - 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
 
     ```cpp
     namespace ops = paddle::operators;
     REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
-    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
-                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
     ```
 
    在上面的代码中：
@@ -209,20 +209,20 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
     - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 
 
-- 在 `.cu`文件中注册GPU Kernel。
-    - 请注意，如果GPU Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
+- 在 `.cu`文件中注册CUDA Kernel。
+    - 请注意，如果CUDA Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
 
     ```cpp
     // if use Eigen unsupported module before include head files
-    // #define EIGEN_USE_GPU
+    #define EIGEN_USE_GPU
 
     namespace ops = paddle::operators;
-    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
-    REGISTER_OP_GPU_KERNEL(mul_grad,
-                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
     ```
 
-### 5. 编译
+### 编译
 
 运行下面命令可以进行编译：
 
@@ -236,71 +236,57 @@ make mul_op
 
 ## 实现单元测试
 
-单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
 
-### 前向Operator单元测试
+### 前向Operator单测
 
-前向Op单元测试继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator，需要：
+Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator，需要：
 
 1. 在`setUp`函数定义输入、输出，以及相关的属性参数。
 2. 生成随机的输入数据。
 3. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比。
+4. 反向计算已经自动集成进测试框架，直接调用相应接口即可。
 
 
   ```python
   import unittest
   import numpy as np
-  from gradient_checker import GradientChecker, create_op
-  from op_test_util import OpTestMeta
+  from op_test import OpTest
 
-  class TestMulOp(unittest.TestCase):
-      __metaclass__ = OpTestMeta
 
+  class TestMulOp(OpTest):
       def setUp(self):
-          self.type = "mul"
+          self.op_type = "mul"
           self.inputs = {
               'X': np.random.random((32, 84)).astype("float32"),
               'Y': np.random.random((84, 100)).astype("float32")
           }
           self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-  ```
-
-上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
-
-- `self.type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
-- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
-- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
 
+      def test_check_output(self):
+          self.check_output()
 
-### 反向Operator单元测试
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
-反向Op单元测试继承自`GradientChecker`，而`GradientChecker`继承自`unittest.TestCase`，因此，**反向单元测试函数需要以`test_`开头**。
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
 
-```python
-class TestMulGradOp(GradientChecker):
-    def setUp(self):
-        self.op = create_op("mul")
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
-        }
-
-    def test_check_grad_normal(self):
-        # mul op will enlarge the relative error
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+  ```
 
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
 
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-```
+- `self.op_type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
+- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
+- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
 
-下面解释代码中一些关键的地方:
+### 反向operator单测
 
-- 调用`create_op("mul")`创建反向Op对应的前向Op。
+而反向测试中：
 - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
   - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
   - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
@@ -308,7 +294,7 @@ class TestMulGradOp(GradientChecker):
 - `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
 
 
-### 编译和执行单元测试
+### 编译和执行
 
 `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
 
@@ -328,5 +314,5 @@ ctest -R test_mul_op
 
 - 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
 - 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
-- 如果Op没有实现GPU Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index 1e88e1f5b4..fe86936bc1 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -1,8 +1,8 @@
 # How to write a new operator
 
  - [Background](#background)
- - [Implementing C++ Types](#implementing-c++-types)
-   - [Defining ProtoMaker](#defining-protoMaker)
+ - [Implementing C++ Types](#implementing-c-types)
+   - [Defining ProtoMaker](#defining-protomaker)
    - [Defining Operator](#defining-operator)
    - [Registering Operator](#registering-operator)
    - [Compilation](#compilation)
@@ -28,8 +28,8 @@ An operator can be differentiated by whether in has kernel methods. An operator
 --------------  | :----------------------
 OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
 Op definition           | `.cc` files
-Kernel implementation       | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files.
-Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
+Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
 
 
 New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
@@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd
 ## Implementing C++ Types
 
 
-### 1. Defining Class ProtoMaker
+### Defining ProtoMaker
 
 Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output.
 
@@ -98,7 +98,7 @@ There are two changes in this example:
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
 
 
-### 2. Defining Operator
+### Defining Operator
 
 The following code defines the interface for MulOp:
 
@@ -147,11 +147,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
 
-### 3. Defining OpKernel
+### Defining OpKernel
 
 `MulKernel` inherits `framework::OpKernel`, which includes the following templates:
 
-- `typename  Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
 
 - `typename T` denotes data type, such as `float` or `double`.
 
@@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 `MulKernel`'s implementation of `Compute` is as follows:
 
   ```cpp
-  template <typename Place, typename T>
+  template <typename DeviceContext, typename T>
   class MulKernel : public framework::OpKernel {
   public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
     auto* Y = context.Input<Tensor>("Y");
     auto* Z = context.Output<Tensor>("Out");
     Z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
   }
   };
   ```
 
-Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
 
-`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
 
 To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
 
@@ -189,16 +188,16 @@ This concludes the forward implementation of an operator. Next its operation and
 
 The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
 
-### 4. Registering Operator
+### Registering Operator
 
 - In `.cc` files, register forward and backward operator classes and the CPU kernel.
 
     ```cpp
     namespace ops = paddle::operators;
     REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
-    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
-                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
     ```
 
    In that code block,
@@ -208,20 +207,20 @@ The definition of its corresponding backward operator, if applicable, is similar
     - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
-- Registering GPU Kernel in `.cu` files
-    - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+- Registering CUDA Kernel in `.cu` files
+    - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
 
     ```cpp
     // if use Eigen unsupported module before include head files
     #define EIGEN_USE_GPU
 
     namespace ops = paddle::operators;
-    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
-    REGISTER_OP_GPU_KERNEL(mul_grad,
-                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
     ```
 
-### 5. Compilation
+### Compilation
 
 Run the following commands to compile.
 
@@ -253,62 +252,51 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
 
 2. Generating random input data.
 
-3. Implementing the same computation logic in a Python script:
+3. Implementing the same computation logic in a Python script.
+
+4. Call check gradient function to check the backward operator.
 
   ```python
   import unittest
   import numpy as np
-  from gradient_checker import GradientChecker, create_op
-  from op_test_util import OpTestMeta
+  from op_test import OpTest
 
-  class TestMulOp(unittest.TestCase):
-      __metaclass__ = OpTestMeta
 
+  class TestMulOp(OpTest):
       def setUp(self):
-          self.type = "mul"
+          self.op_type = "mul"
           self.inputs = {
               'X': np.random.random((32, 84)).astype("float32"),
               'Y': np.random.random((84, 100)).astype("float32")
           }
           self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+      def test_check_output(self):
+          self.check_output()
+          
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
   ```
 Get its output, and compare it with the forward operator's own output.
 
 The code above first loads required packages. In addition, we have
 
-- `self.type = "mul" ` defines the type that is identical to what the operator's registered type.
+- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type.
 - `self.inputs` defines input, with type `numpy.array` and initializes it.
 - `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
 
 ### Testing Backward Operators
 
-A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**.
-
-```python
-class TestMulGradOp(GradientChecker):
-    def setUp(self):
-        self.op = create_op("mul")
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
-        }
-
-    def test_check_grad_normal(self):
-        # mul op will enlarge the relative error
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-```
-
-Some key points in the code above include:
+Some key points in checking gradient above include:
 
-- `create_op("mul")` creates the backward operator's corresponding forward operator.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
   - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
   - The second variable `"Out"` points to the network's final output target `Out`.
@@ -338,5 +326,5 @@ ctest -R test_mul_op
 
 - Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
 - The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
-- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
 - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 174fb7a074..678a00c9d3 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -9,9 +9,6 @@
 
   usage/cmd_parameter/index_cn.rst
   usage/cluster/cluster_train_cn.md
-  usage/k8s/k8s_basis_cn.md
-  usage/k8s/k8s_cn.md
-  usage/k8s/k8s_distributed_cn.md
   usage/capi/overview.md
 
 开发标准
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 61bf25ccd1..6d1bf7dfc0 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -9,8 +9,6 @@ Usage
 
   usage/cmd_parameter/index_en.rst
   usage/cluster/cluster_train_en.md
-  usage/k8s/k8s_en.md
-  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
new file mode 100644
index 0000000000..e4211abb3b
--- /dev/null
+++ b/doc/howto/read_source.md
@@ -0,0 +1,67 @@
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
+  - Every Layer has one or more operators and variables/parameters
+    - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
+      - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
+      - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) 
+      - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
+
+# Run Time
+
+The following **evaluates** the NN. Instantiates all the variables, operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+        feed=feeder.feed(data),
+        fetch_list=[avg_cost])
+```
+
+- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
+  - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+  - Feeds the data: `feed=feeder.feed(data)`
+  - Evaluates all the operators
+  - Fetches the result: `fetch_list=[avg_cost]`
+- Other worth looking files:
+  - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
+    - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
+      - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index 2e98b3de3f..c9f90538a6 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -1,25 +1,8 @@
 # PaddlePaddle分布式训练
 
-* [概述](#概述)
-* [环境准备](#环境准备)
-* [启动参数说明](#启动参数说明)
-  * [启动参数服务器](#启动参数服务器)
-  * [启动计算节点](#启动计算节点)
-  * [准备数据集](#准备数据集)
-  * [准备训练程序](#准备训练程序)
-* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
-  * [使用Fabric启动集群作业](#使用fabric启动集群作业)
-     * [准备一个Linux集群](#准备一个linux集群)
-     * [启动集群作业](#启动集群作业)
-     * [终止集群作业](#终止集群作业)
-     * [检查集群训练结果](#检查集群训练结果)
-     * [检查模型输出](#检查模型输出)
-  * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
-     * [准备OpenMPI集群](#准备OpenMPI集群)
-     * [启动集群作业](#启动集群作业-1)
-  * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
 
 ## 概述
+
 本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
 
 <img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
@@ -32,10 +15,11 @@
 
 在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
 
+
 ## 环境准备
 
 1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
 
 安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
 ```bash
@@ -63,12 +47,12 @@ $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradie
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```
 
-| 参数  | 是否必选 | 默认值 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | 必选 | 7164 | pserver监听的起始端口，根据ports_num决定<br>总端口个数，从起始端口监听多个端口用于通信  |
-| ports_num  | 必选 | 1 | 监听的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+参数说明
+
+- port：**必选，默认7164**，pserver监听的起始端口，根据ports_num决定总端口个数，从起始端口监听多个端口用于通信
+- ports_num：**必选，默认1**，监听的端口个数
+- ports_num_for_sparse：**必选，默认1**，用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
 
 ### 启动计算节点
 执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
@@ -105,16 +89,16 @@ paddle.init(
         pservers="127.0.0.1")
 ```
 
-| 参数  | 是否必选 | 默认 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | 可选 | False | 是否启用GPU训练 |
-| trainer_count  | 必选 | 1 | 当前训练任务trainer总个数 |
-| port  | 必选 | 7164 | 连接到pserver的端口  |
-| ports_num  | 必选 | 1 | 连接到pserver的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
-| trainer_id  | 必选 | 0 | 每个trainer的唯一ID，从0开始的整数 |
-| pservers  | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开 |
+参数说明
+
+- use_gpu： **可选，默认False**，是否启用GPU训练
+- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- port：**必选，默认7164**，连接到pserver的端口
+- ports_num：**必选，默认1**，连接到pserver的端口个数
+- ports_num_for_sparse：**必选，默认1**，和pserver之间用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
+- pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开
 
 
 ### 准备数据集
@@ -171,7 +155,7 @@ test.txt-00002
 
 - `my_lib.py`：会被`train.py`调用的一些用户定义的库函数，比如PIL库等。
 - `word_dict.pickle`：在`train.py`中会使用到的字典数据文件。
-- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
+- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
 
   ```python
   cluster_train_file = "./train_data_dir/train/train.txt"
@@ -195,91 +179,10 @@ PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务
 
 在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
 
-### 使用Fabric启动集群作业
-
-#### 准备一个Linux集群
-可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
-
-#### 启动集群作业
-
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
-
-`paddle.py` 为方便作业启动提供了两个独特的命令选项。
-
--  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
--  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
-
-`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
-```
-sh run.sh
-```
-
-集群作业将会在几秒后启动。
-
-#### 终止集群作业
-`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
-
-#### 检查集群训练结果
-详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
-
-`paddle_trainer.INFO`
-提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
-
-`paddle_pserver2.INFO`
-提供 pserver 运行日志，有助于诊断分布式错误。
-
-`server.log`
-提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-`train.log`
-提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-#### 检查模型输出
-运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
-工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
-
-### 在OpenMPI集群中提交训练作业
-
-#### 准备OpenMPI集群
-
-执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
-
-```bash
-paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
-
-#### 启动集群作业
-
-您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
-
-```bash
-# 获得head和node节点的IP地址
-kubectl get po -o wide
-# 将node节点的IP地址保存到machines文件中
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# 拷贝必要的文件到head节点
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# ssh 登录到head节点
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- 以下操作均在head节点中执行 ---------------
-# 准备训练数据
-python prepare.py
-# 拷贝训练程序和字典文件到每台MPI节点
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# 创建日志目录
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# 拷贝训练数据到各自的节点
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# 启动训练任务
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### 在Kubernetes集群中提交训练作业
+## 在不同集群中运行
 
-此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。
+  - [fabric](fabric_cn.md)
+  - [openmpi](openmpi_cn.md)
+  - [kubernetes](k8s_cn.md)
+  - [kubernetes distributed](k8s_distributed_cn.md)
+  - [kubernetes on AWS](k8s_aws_cn.md)
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index baa97c0c02..f9819470c0 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -1,24 +1,5 @@
 # PaddlePaddle Distributed Training
 
-* [Introduction](#introduction)
-* [Preparations](#preparations)
-* [Command-line arguments](#command-line-arguments)
-   * [Starting parameter server](#starting-parameter-server)
-   * [Starting trainer](#starting-trainer)
-   * [Prepare Training Dataset](#prepare-training-dataset)
-   * [Prepare Training program](#prepare-training-program)
-* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools)
-   * [Cluster Training Using Fabric](#cluster-training-using-fabric)
-      * [Prepare a Linux cluster](#prepare-a-linux-cluster)
-      * [Launching Cluster Job](#launching-cluster-job)
-      * [Kill Cluster Job](#kill-cluster-job)
-      * [Check Cluster Training Result](#check-cluster-training-result)
-      * [Check Model Output](#check-model-output)
-   * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi)
-      * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster)
-      * [Launching Cluster Job](#launching-cluster-job-1)
-   * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes)
-
 ## Introduction
 
 In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
@@ -35,7 +16,7 @@ When training with synchronize SGD, PaddlePaddle uses an internal "synchronize b
 
 ## Preparations
 1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
 
 After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
 
@@ -67,12 +48,12 @@ If you wish to run parameter servers in background, and save a log file, you can
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```
 
-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | required | 7164 | port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput |
-| ports_num  | required | 1 | total number of ports will listen on  |
-| ports_num_for_sparse  | required | 1 | number of ports which serves sparse parameter update  |
-| num_gradient_servers  | required | 1 | total number of gradient servers |
+Parameter Description
+
+- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput.
+- ports_num: **required, default 1**, total number of ports will listen on.
+- ports_num_for_sparse: **required, default 1**, number of ports which serves sparse parameter update.
+- num_gradient_servers: **required, default 1**, total number of gradient servers.
 
 ### Starting trainer
 Type the command below to start the trainer(name the file whatever you want, like "train.py")
@@ -111,16 +92,16 @@ paddle.init(
         pservers="127.0.0.1")
 ```
 
-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | optional | False | set to "True" to enable GPU training |
-| trainer_count  | required | 1 | total count of trainers in the training job |
-| port  | required | 7164 | port to connect to parameter server  |
-| ports_num  | required | 1 | number of ports for communication |
-| ports_num_for_sparse  | required | 1 | number of ports for sparse type caculation |
-| num_gradient_servers  | required | 1 | total number of gradient server |
-| trainer_id  | required | 0 | ID for every trainer, start from 0 |
-| pservers  | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," |
+Parameter Description
+
+- use_gpu: **optional, default False**, set to "True" to enable GPU training.
+- trainer_count: **required, default 1**, total count of trainers in the training job.
+- port: **required, default 7164**, port to connect to parameter server.
+- ports_num: **required, default 1**, number of ports for communication.
+- ports_num_for_sparse: **required, default 1**, number of ports for sparse type caculation.
+- num_gradient_servers: **required, default 1**, total number of gradient server.
+- trainer_id: **required, default 0**, ID for every trainer, start from 0.
+- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
 
 ### Prepare Training Dataset
 
@@ -178,7 +159,7 @@ Your workspace may looks like:
 
 - `my_lib.py`: user defined libraries, like PIL libs. This is optional.
 - `word_dict.pickle`: dict file for training word embeding.
-- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:
+- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:
 
   ```python
   cluster_train_file = "./train_data_dir/train/train.txt"
@@ -202,92 +183,10 @@ We'll introduce cluster job management on these platforms. The examples can be f
 
 These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
 
-### Cluster Training Using Fabric
-
-#### Prepare a Linux cluster
-
-Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
-
-#### Launching Cluster Job
-`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
-
-`paddle.py`provides two distinguished command option for easy job launching.
-
-- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
-- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
-dispatch latency.
-
-`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
-```
-sh run.sh
-```
-
-The cluster Job will start in several seconds.
-
-#### Kill Cluster Job
-`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
-
-#### Check Cluster Training Result
-Check log in $workspace/log for details, each node owns same log structure.
-
-`paddle_trainer.INFO`
-It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
-
-`paddle_pserver2.INFO`
-It provides parameter server running log, which could help to diagnose distributed error.
-
-`server.log`
-It provides stderr and stdout of parameter server process. Check error log if training crashes.
-
-`train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashes.
-
-#### Check Model Output
-After one pass finished, model files will be written in `output` directory in node 0.
-`nodefile` in workspace indicates the node id of current cluster job.
-
-### Cluster Training Using OpenMPI
-
-#### Prepare an OpenMPI cluster
-
-Run the following command to start a 3-node MPI cluster and one "head" node.
-
-```bash
-cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-Then you can log in to every OpenMPI node using ssh without input any passwords.
-
-#### Launching Cluster Job
-
-Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\
-
-```bash
-# find out node IP addresses
-kubectl get po -o wide
-# generate a "machines" file containing node IP addresses
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# copy necessary files onto "head" node
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# login to head node using ssh
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- in head node ---------------
-# prepare training data
-python prepare.py
-# copy training data and dict file to MPI nodes
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# creat a directory for storing log files
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# copy training data to every node
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# start the job
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### Cluster Training Using Kubernetes
+## Use different clusters
 
-The details can be found [here](../k8s/k8s_cn.md)
+  - [fabric](fabric_en.md)
+  - [openmpi](openmpi_en.md)
+  - [kubernetes](k8s_en.md)
+  - kubernetes distributed
+  - [kubernetes on AWS](k8s_aws_en.md)
diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/usage/cluster/fabric_cn.md
new file mode 100644
index 0000000000..0385e401b3
--- /dev/null
+++ b/doc/howto/usage/cluster/fabric_cn.md
@@ -0,0 +1,42 @@
+# 使用fabric启动集群训练
+
+## 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
+
+## 启动集群作业
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+## 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+## 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志，有助于诊断分布式错误。
+
+`server.log`
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+## 检查模型输出
+运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/usage/cluster/fabric_en.md
new file mode 100644
index 0000000000..bf270d89ab
--- /dev/null
+++ b/doc/howto/usage/cluster/fabric_en.md
@@ -0,0 +1,43 @@
+# Cluster Training Using Fabric
+
+## Prepare a Linux cluster
+
+Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
+
+## Launching Cluster Job
+`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
+
+`paddle.py`provides two distinguished command option for easy job launching.
+
+- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
+- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
+dispatch latency.
+
+`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
+```
+sh run.sh
+```
+
+The cluster Job will start in several seconds.
+
+## Kill Cluster Job
+`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
+
+## Check Cluster Training Result
+Check log in $workspace/log for details, each node owns same log structure.
+
+`paddle_trainer.INFO`
+It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
+
+`paddle_pserver2.INFO`
+It provides parameter server running log, which could help to diagnose distributed error.
+
+`server.log`
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
+
+`train.log`
+It provides stderr and stdout of trainer process. Check error log if training crashes.
+
+## Check Model Output
+After one pass finished, model files will be written in `output` directory in node 0.
+`nodefile` in workspace indicates the node id of current cluster job.
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/usage/cluster/k8s_aws_cn.md
new file mode 120000
index 0000000000..c44cd9a731
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s_aws_cn.md
@@ -0,0 +1 @@
+k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/cluster/k8s_aws_en.md
similarity index 100%
rename from doc/howto/usage/k8s/k8s_aws_en.md
rename to doc/howto/usage/cluster/k8s_aws_en.md
diff --git a/doc/howto/usage/k8s/k8s_cn.md b/doc/howto/usage/cluster/k8s_cn.md
similarity index 100%
rename from doc/howto/usage/k8s/k8s_cn.md
rename to doc/howto/usage/cluster/k8s_cn.md
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/cluster/k8s_distributed_cn.md
similarity index 91%
rename from doc/howto/usage/k8s/k8s_distributed_cn.md
rename to doc/howto/usage/cluster/k8s_distributed_cn.md
index a9bebf0955..0fc9e37a99 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/cluster/k8s_distributed_cn.md
@@ -1,6 +1,6 @@
 # Kubernetes分布式训练
 
-前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
 有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
 
@@ -28,7 +28,7 @@ PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行
 - 拷贝训练文件到容器内
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练
 
-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。
+因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。
 
 ```bash
 $ cd doc/howto/usage/k8s/src/k8s_train
@@ -149,20 +149,19 @@ spec:
 
 文件中，`metadata`下的`name`表示这个job的名字。`parallelism，completions`字段表示这个job会同时开启3个PaddlePaddle节点，成功训练且退出的pod数目为3时，这个job才算成功结束。然后申明一个存储卷`jobpath`，代表宿主机目录`/home/work/mfs`，在对容器的描述`containers`字段中，将此目录挂载为容器的`/home/jobpath`目录，这样容器的`/home/jobpath`目录就成为了共享存储，放在这个目录里的文件其实是保存到了MFS上。
 
-`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内：
 
-环境变量 | 说明
---- | ---
-JOB_PATH | 共享存储挂在的路径
-JOB_NAME | Job的名字
-TRAIN_CONFIG_DIR | 本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
-CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数，即网卡名
-CONF_PADDLE_PORT | `paddle paserver`的`--port`参数
-CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量，即`--ports_num`参数
-CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量，即`--ports_num_for_sparse`参数
-CONF_PADDLE_GRADIENT_NUM | 训练节点数量，即`--num_gradient_servers参数`
 
-这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+- JOB_PATH：共享存储挂在的路径
+- JOB_NAME：Job的名字
+- TRAIN_CONFIG_DIR：本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+- CONF_PADDLE_NIC：`paddle pserver`进程需要的`--nics`参数，即网卡名
+- CONF_PADDLE_PORT：`paddle paserver`的`--port`参数
+- CONF_PADDLE_PORTS_NUM：稠密更新的端口数量，即`--ports_num`参数
+- CONF_PADDLE_PORTS_NUM_SPARSE：稀疏更新的端口数量，即`--ports_num_for_sparse`参数
+- CONF_PADDLE_GRADIENT_NUM：训练节点数量，即`--num_gradient_servers参数`
+
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
 
 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
 
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/cluster/k8s_en.md
similarity index 100%
rename from doc/howto/usage/k8s/k8s_en.md
rename to doc/howto/usage/cluster/k8s_en.md
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/usage/cluster/openmpi_cn.md
new file mode 100644
index 0000000000..831cafdc03
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_cn.md
@@ -0,0 +1,41 @@
+# 在OpenMPI集群中提交训练作业
+
+## 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
+
+```bash
+paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+## 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/usage/cluster/openmpi_en.md
new file mode 100644
index 0000000000..09af46e25e
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_en.md
@@ -0,0 +1,41 @@
+# Cluster Training Using OpenMPI
+
+## Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without input any passwords.
+
+## Launching Cluster Job
+
+Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy training data and dict file to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# creat a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/k8s/src/Dockerfile b/doc/howto/usage/cluster/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/k8s/src/Dockerfile
rename to doc/howto/usage/cluster/src/Dockerfile
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/k8s/src/add_security_group.png
rename to doc/howto/usage/cluster/src/add_security_group.png
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/k8s/src/create_efs.png
rename to doc/howto/usage/cluster/src/create_efs.png
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/k8s/src/efs_mount.png
rename to doc/howto/usage/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/usage/cluster/src/k8s-paddle-arch.png
new file mode 100644
index 0000000000..b3800c4fe8
Binary files /dev/null and b/doc/howto/usage/cluster/src/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/Dockerfile
rename to doc/howto/usage/cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/README.md
rename to doc/howto/usage/cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/get_data.sh
rename to doc/howto/usage/cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/Dockerfile
rename to doc/howto/usage/cluster/src/k8s_train/Dockerfile
diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/cluster/src/k8s_train/README.md
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/README.md
rename to doc/howto/usage/cluster/src/k8s_train/README.md
diff --git a/doc/howto/usage/k8s/src/k8s_train/start.sh b/doc/howto/usage/cluster/src/k8s_train/start.sh
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/start.sh
rename to doc/howto/usage/cluster/src/k8s_train/start.sh
diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/start_paddle.py
rename to doc/howto/usage/cluster/src/k8s_train/start_paddle.py
diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/cluster/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/k8s/src/managed_policy.png
rename to doc/howto/usage/cluster/src/managed_policy.png
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/cluster/src/pserver_and_trainer.png
similarity index 100%
rename from doc/howto/usage/k8s/src/pserver_and_trainer.png
rename to doc/howto/usage/cluster/src/pserver_and_trainer.png
diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/cluster/src/route53_create_recordset.png
similarity index 100%
rename from doc/howto/usage/k8s/src/route53_create_recordset.png
rename to doc/howto/usage/cluster/src/route53_create_recordset.png
diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/cluster/src/route53_create_zone.png
similarity index 100%
rename from doc/howto/usage/k8s/src/route53_create_zone.png
rename to doc/howto/usage/cluster/src/route53_create_zone.png
diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/cluster/src/worker_security_group.png
similarity index 100%
rename from doc/howto/usage/k8s/src/worker_security_group.png
rename to doc/howto/usage/cluster/src/worker_security_group.png
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
deleted file mode 100644
index 4c3dc81ed3..0000000000
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Kubernetes 简介
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
-
-- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
-
-- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
-- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods，job启动后会创建这些pod并开始执行一个程序，等待这个程序执行成功并返回0则成功退出，如果执行失败，也可以配置不同的重试机制。
-
-- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
-
-- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
-
-- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
-
-## 部署Kubernetes集群
-
-Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出集中常见的部署方法：
-
-- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器，便于本地验证和测试。
-- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统，不同主机(Bare-Metal, AWS, GCE)条件下，快速部署集群。
-- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
-- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
-
-可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
-
-## 选择存储方案
-
-容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
-常见的可选存储服务包括：
-
-- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单，可以用于小量数据的验证。不提供分布式存储，高可用，冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
-- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统，可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
-- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
-- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
-
-## 配置kubectl
-
-### 安装kubectl
-```
-# OS X
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
-
-# Linux
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
-
-# Windows
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
-```
-
-### 配置kubectl访问你的kubernetes集群
-
-编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
-```
-apiVersion: v1
-clusters:
-- cluster:
-    certificate-authority: /path/to/ca.crt
-    server: https://[Master-IP]:443
-  name: minikube
-contexts:
-- context:
-    cluster: minikube
-    user: minikube
-  name: minikube
-current-context: minikube
-kind: Config
-preferences: {}
-users:
-- name: minikube
-  user:
-    client-certificate: /path/to/apiserver.crt
-    client-key: /Users/wuyi/.minikube/apiserver.key
-```
diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
deleted file mode 100644
index 2183a232ad..0000000000
Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and /dev/null differ
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
index 9da48e7f21..d5196d9a4c 100644
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -18,11 +18,11 @@ PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/
 
 - `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后，PaddlePaddle的CMake系统会自动编译所有的第三方依赖库，并且强制设置一些PaddlePaddle参数的值（`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`）。
 - `WITH_C_API`，是否编译C-API预测库，必须设置为ON。在iOS平台上只支持使用C-API来预测。
-- `WITH_SWIG_PY`，必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。
+- `WITH_SWIG_PY`，必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。
 
 iOS平台可选配置参数：
 
-- `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
+- `IOS_PLATFORM`，可设置为`OS`（默认值）或`SIMULATOR`。
   - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
   - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
 - `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
new file mode 100644
index 0000000000..aa390cd61f
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_en.md
@@ -0,0 +1,120 @@
+# PaddlePaddle Compiling Guide for iOS
+
+This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
+
+## Preparation
+
+Apple provides Xcode for cross-compiling and IDE for iOS development. Download from App store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run command as follows
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Cross-compiling configurations
+
+PaddlePaddle provides cross-compiling toolchain configuration documentation [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+
+There are some mandatory environment variables need to be set before cross compiling PaddlePaddle for iOS:
+
+- `CMAKE_SYSTEM_NAME`, CMake compiling target platform name, has to be `iOS`. PaddlePaddle CMake will compile all the third party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`,`WITH_RDMA=OFF`) when this variable is set with value `iOS`.
+
+- `WITH_C_API`, Whether to compile inference C-API library, has to be `ON`, since C-API is the only supported interface for inferencing in iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. It's not supported to inference or train via swig in iOS.
+
+Optional environment variables for iOS are:
+
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+  - `OS`, build targets ARM-based physical devices like iPhone or iPad.
+  - `SIMULATOR`, build targets x86 architecture simulators.
+- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below:
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+
+- `IOS_DEPLOYMENT_TARGET`, minimum iOS version to deployment, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. values can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to `Developer` directory, can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the Xcode corresponding `platform`'s `Developer` directory based on your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to `SDK` root, can be explicitly set with your  `/path/to/platform/Developer/SDKs/SDK`. if left black, PaddlePaddle will pick the latest SDK in the directory of `IOS_DEVELOPMENT_ROOT`.
+
+other settings：
+
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, host C/C++ compiler. Uses value from environment variable `CC/CXX` by default or `cc/c++` if `CC/CXX` doesn't exist.
+
+some typical cmake configurations:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can set other compiling parameters for your own need. I.E. if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` with `MinSizeRel`; or if the performance is your concern, set `CMAKE_BUILD_TYPE` with `Release`. You can even manipulate the PaddlePaddle compiling procedure by manually set `CMAKE_C/CXX_FLAGS` values.
+
+**TIPS for a better performance**:
+
+- set `CMAKE_BUILD_TYPE` with `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
+
+## Compile and install
+
+After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library.
+
+```
+$ make
+$ make install
+```
+
+Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration.
+
+`your/path/to/install` directory will have following directories after `compile` and `install`:
+
+- `include`, contains all the C-API header files.
+- `lib`, contains PaddlePaddle C-API static library.
+- `third_party` contains all the 3rd party libraries.
+
+Please note: if PaddlePaddle library need to support both physical devices and simulators, you will need to compile correspondingly, then merge fat library with `lipo`.
+
+Now you will have PaddlePaddle library compiled and installed, the fat library can be used in deep learning related iOS APPs. Please refer to C-API documentation for usage guides.
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
index 3c08d73671..ef421dacad 100644
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
@@ -5,4 +5,5 @@ MOBILE
   :maxdepth: 1
 
   cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
   cross_compiling_for_raspberry_en.md
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index d6b8464100..cf84568ecd 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -25,8 +25,18 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
 
 SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
 
+SET(SWIG_NEED_FLAGS
+    -ftls-model=global-dynamic
+    -Wno-parentheses-equality
+    -Wno-self-assign
+    -Wno-maybe-uninitialized
+    -Wno-missing-field-initializers)
+  FOREACH(flag ${SWIG_NEED_FLAGS})
+  safe_set_cxxflag(SWIG_CXX_FLAGS ${flag})
+ENDFOREACH()
+
 SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}")
 
 SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
     paddle_parameter
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
index bb8249a551..c038789340 100644
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
@@ -43,4 +43,11 @@ paddle_error paddle_init(int argc, char** argv) {
   isInit = true;
   return kPD_NO_ERROR;
 }
+
+paddle_error paddle_init_thread() {
+  if (FLAGS_use_gpu) {
+    hl_init(FLAGS_gpu_id);
+  }
+  return kPD_NO_ERROR;
+}
 }
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 30f3a766f0..cbacd1fb71 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) {
 paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                    uint64_t rowID,
                                    paddle_real* rowArray) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp
new file mode 100644
index 0000000000..96ce31b45f
--- /dev/null
+++ b/paddle/capi/error.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "error.h"
+
+extern "C" const char* paddle_error_string(paddle_error err) {
+  switch (err) {
+    case kPD_NULLPTR:
+      return "nullptr error";
+    case kPD_OUT_OF_RANGE:
+      return "out of range error";
+    case kPD_PROTOBUF_ERROR:
+      return "protobuf error";
+    case kPD_NOT_SUPPORTED:
+      return "not supported error";
+    case kPD_UNDEFINED_ERROR:
+      return "undefined error";
+    default:
+      return "";
+  }
+}
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
index 44d8c2040d..2da9e0a3ef 100644
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef __PADDLE_CAPI_ERROR_H__
 #define __PADDLE_CAPI_ERROR_H__
 
+#include "config.h"
+
 /**
  * Error Type for Paddle API.
  */
@@ -27,4 +29,17 @@ typedef enum {
   kPD_UNDEFINED_ERROR = -1,
 } paddle_error;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Error string for Paddle API.
+ */
+PD_API const char* paddle_error_string(paddle_error err);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
index 98e411ddc0..2fc8debdde 100644
--- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
@@ -1,8 +1,29 @@
 project(multi_thread)
 cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
+
 find_package (Threads)
+
+if(NOT PADDLE_ROOT)
+  set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
+endif()
+if(PADDLE_ROOT)
+  include_directories(${PADDLE_ROOT}/include)
+  link_directories(${PADDLE_ROOT}/lib)
+endif()
+
+set(CPU_SRCS main.c)
+add_executable(${PROJECT_NAME} ${CPU_SRCS})
 set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared
-  ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${PROJECT_NAME}
+                      -lpaddle_capi_shared
+                      ${CMAKE_THREAD_LIBS_INIT})
+
+find_package(CUDA QUIET)
+if(CUDA_FOUND)
+  set(GPU_SRCS main_gpu.c)
+  cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS})
+  set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99)
+  target_link_libraries(${PROJECT_NAME}_gpu
+                        -lpaddle_capi_shared
+                        ${CMAKE_THREAD_LIBS_INIT})
+endif(CUDA_FOUND)
diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
new file mode 100644
index 0000000000..6fd376e0d1
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
@@ -0,0 +1,113 @@
+#include <paddle/capi.h>
+#include <pthread.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+
+pthread_mutex_t mutex;
+
+/*
+ * @brief It is an simple inference example that runs multi-threads on a GPU.
+ *        Each thread holds it own local gradient_machine but shares the same
+ *        parameters.
+ *        If you want to run on different GPUs, you need to launch
+ *        multi-processes or set trainer_count > 1.
+ */
+void* thread_main(void* gm_ptr) {
+  // Initialize the thread environment of Paddle.
+  CHECK(paddle_init_thread());
+
+  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
+  // Create input arguments.
+  paddle_arguments in_args = paddle_arguments_create_none();
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ true);
+  // Create output arguments.
+  paddle_arguments out_args = paddle_arguments_create_none();
+  // Create output matrix.
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  // CPU buffer to cache the input and output.
+  paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real));
+  paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real));
+  for (int iter = 0; iter < NUM_ITER; ++iter) {
+    // There is only one input layer of this network.
+    CHECK(paddle_arguments_resize(in_args, 1));
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+    for (int i = 0; i < 784; ++i) {
+      cpu_input[i] = rand() / ((float)RAND_MAX);
+    }
+    CHECK(paddle_matrix_set_value(mat, cpu_input));
+
+    CHECK(paddle_gradient_machine_forward(machine,
+                                          in_args,
+                                          out_args,
+                                          /* isTrain */ false));
+
+    CHECK(paddle_arguments_get_value(out_args, 0, prob));
+    CHECK(paddle_matrix_get_value(prob, cpu_output));
+
+    pthread_mutex_lock(&mutex);
+    printf("Prob: ");
+    for (int i = 0; i < 10; ++i) {
+      printf("%.2f ", cpu_output[i]);
+    }
+    printf("\n");
+    pthread_mutex_unlock(&mutex);
+  }
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  free(cpu_input);
+  free(cpu_output);
+
+  return NULL;
+}
+
+int main() {
+  // Initalize Paddle
+  char* argv[] = {"--use_gpu=True"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Reading config binary file. It is generated by `convert_protobin.sh`
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  srand(time(0));
+  pthread_mutex_init(&mutex, NULL);
+
+  pthread_t threads[NUM_THREAD];
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    paddle_gradient_machine thread_local_machine;
+    CHECK(paddle_gradient_machine_create_shared_param(
+        machine, buf, size, &thread_local_machine));
+    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
+  }
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    pthread_join(threads[i], NULL);
+  }
+
+  pthread_mutex_destroy(&mutex);
+
+  return 0;
+}
diff --git a/paddle/capi/main.h b/paddle/capi/main.h
index 893ebcbd58..99c4e8428d 100644
--- a/paddle/capi/main.h
+++ b/paddle/capi/main.h
@@ -26,6 +26,13 @@ extern "C" {
  */
 PD_API paddle_error paddle_init(int argc, char** argv);
 
+/**
+ * Initialize the thread environment of Paddle.
+ * @note it is requisite for GPU runs but optional for CPU runs.
+ *       For GPU runs, all threads will run on the same GPU devices.
+ */
+PD_API paddle_error paddle_init_thread();
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 89c1f48eda..8841806292 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -116,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt,
  * @param[in]   paddingW    padding width.
  * @param[out]  tgtData     output data.
  * @param[in]   tgtStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
  *
  */
 extern void hl_avgpool_forward(const int frameCnt,
@@ -132,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               bool excludeMode);
 
 /**
  * @brief   Maximum pool backward.
@@ -154,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt,
  * @param[in]   scaleB      scale.
  * @param[out]  backGrad    output grad.
  * @param[in]   outStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
  *
  */
 extern void hl_avgpool_backward(const int frameCnt,
@@ -172,7 +175,8 @@ extern void hl_avgpool_backward(const int frameCnt,
                                 real scaleA,
                                 real scaleB,
                                 real* backGrad,
-                                const int outStride);
+                                const int outStride,
+                                bool excludeMode);
 
 extern void hl_maxpool3D_forward(const int frameCnt,
                                  const real* inputData,
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 968ed4840f..706cc59a8e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -68,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               const bool excludeMode) {}
 
 inline void hl_avgpool_backward(const int frameCnt,
                                 const real* outGrad,
@@ -86,7 +87,8 @@ inline void hl_avgpool_backward(const int frameCnt,
                                 real scaleA,
                                 real scaleB,
                                 real* backGrad,
-                                const int outStride) {}
+                                const int outStride,
+                                const bool excludeMode) {}
 
 inline void hl_maxpool3D_forward(const int frameCnt,
                                  const real* inputData,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 3699b1e8ae..2d1bc4f6d5 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -210,7 +210,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
                                  const int padH,
                                  const int padW,
                                  real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 const bool excludeMode) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -224,7 +225,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
     int wend = min(wstart + sizeX, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int poolSize =
+        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
 
     real aveval = 0;
     inputData += (frameNum * channels + c) * height * width;
@@ -235,7 +237,7 @@ __global__ void KeAvgPoolForward(const int nthreads,
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / pool_size;
+    tgtData[tgtIndex] = aveval / poolSize;
   }
 }
 
@@ -253,7 +255,8 @@ void hl_avgpool_forward(const int frameCnt,
                         const int paddingH,
                         const int paddingW,
                         real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        const bool excludeMode) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
@@ -270,7 +273,8 @@ void hl_avgpool_forward(const int frameCnt,
                                                         paddingH,
                                                         paddingW,
                                                         tgtData,
-                                                        tgtStride);
+                                                        tgtStride,
+                                                        excludeMode);
   CHECK_SYNC("hl_avgpool_forward failed");
 }
 
@@ -290,7 +294,8 @@ __global__ void KeAvgPoolBackward(const int nthreads,
                                   real scaleA,
                                   real scaleB,
                                   real* tgtGrad,
-                                  const int outStride) {
+                                  const int outStride,
+                                  const bool excludeMode) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int offsetW = index % width + padW;
@@ -314,8 +319,9 @@ __global__ void KeAvgPoolBackward(const int nthreads,
         int wstart = pw * strideW - padW;
         int wend = min(wstart + sizeX, width);
         wstart = max(wstart, 0);
-        int poolsize = (hend - hstart) * (wend - wstart);
-        gradient += outGrad[ph * pooledW + pw] / poolsize;
+        int poolSize =
+            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+        gradient += outGrad[ph * pooledW + pw] / poolSize;
       }
     }
     tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
@@ -338,7 +344,8 @@ void hl_avgpool_backward(const int frameCnt,
                          real scaleA,
                          real scaleB,
                          real* backGrad,
-                         const int outStride) {
+                         const int outStride,
+                         const bool excludeMode) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
 
@@ -358,7 +365,8 @@ void hl_avgpool_backward(const int frameCnt,
                                                          scaleA,
                                                          scaleB,
                                                          backGrad,
-                                                         outStride);
+                                                         outStride,
+                                                         excludeMode);
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 4b0eff3adb..206e298eb2 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -58,3 +58,6 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
         proto_desc)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+
+cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_test(init_test SRCS init_test.cc DEPS init)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 7294ba1a9c..faf6e60cbd 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -190,8 +190,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       // collect all the offset for each alias,
       // insert a sum operator to add all aliases to output
       insert_position.push_back(
-          {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}},
-                                               {{"Out", {name}}}, {})});
+          {dup_op.back(),
+           OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}},
+                                AttributeMap{})});
     }
 
     // make sure the inserted `sum` ops follow the BFS order.
@@ -216,7 +217,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
         // If part of input gradient of that operator is not calculated, fill
         // zero variables to that input gradient.
         net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Y", {grad_input}}}, {}));
+                                           {{"Y", {grad_input}}},
+                                           AttributeMap{}));
       }
       return false;
     });
@@ -392,8 +394,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
             0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
         std::string new_name = prefix + kZeroVarSuffix;
         desc->Rename(in_name, new_name);
-        std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
-            "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
+        std::unique_ptr<OpDescBind> fill_zeros_op(
+            new OpDescBind("fill_zeros_like", {{"X", {prefix}}},
+                           {{"Y", {new_name}}}, AttributeMap{}));
         pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
       }
     }
@@ -427,14 +430,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     std::vector<std::unique_ptr<OpDescBind>> op_grads;
 
     if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
-      int step_block_idx = (*it)->GetBlockAttr("step_block");
+      int step_block_idx = (*it)->GetBlockAttr("sub_block");
       BlockDescBind* backward_block = CreateStepBlock(
           program_desc, no_grad_vars, grad_to_var, step_block_idx);
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else if ((*it)->Type() == "conditional_block") {
       BlockDescBind* backward_block =
           CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
-                          (*it)->GetBlockAttr("block"));
+                          (*it)->GetBlockAttr("sub_block"));
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else {
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
@@ -483,8 +486,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         sum_op_inputs.emplace_back(new_name);
         next_g_name = sum_op_inputs.back();
       }
-      std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
-          "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
+      std::unique_ptr<OpDescBind> sum_op(
+          new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}},
+                         AttributeMap{}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
     }
   }
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 2b858f5ea0..9fe49881d5 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -106,15 +106,15 @@ class FcOp : public operators::NetOp {
   FcOp(const std::string &type, const VariableNameMap &inputs,
        const VariableNameMap &outputs, const AttributeMap &attrs)
       : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(OpRegistry::CreateOp("mul",
-                                  {{"X", {Input("X")}}, {"Y", {Input("W")}}},
-                                  {{"Out", {Output("mul_result")}}}, {}));
+    AppendOp(OpRegistry::CreateOp(
+        "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}},
+        {{"Out", {Output("mul_result")}}}, AttributeMap{}));
     auto input_b = Inputs("b");
     std::string before_act = "mul_result";
     if (input_b.size() != 0) {
       AppendOp(OpRegistry::CreateOp(
           "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
-          {{"Out", {Output("add_result")}}}, {}));
+          {{"Out", {Output("add_result")}}}, AttributeMap{}));
       before_act = "add_result";
     } else {
       auto out_varname = Output("add_result");
@@ -124,7 +124,7 @@ class FcOp : public operators::NetOp {
     }
 
     AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
-                                  {{"Out", {Output("Out")}}}, {}));
+                                  {{"Out", {Output("Out")}}}, AttributeMap{}));
     CompleteAddOp(false);
   }
 };
@@ -278,8 +278,9 @@ REGISTER_OPERATOR(scale, f::NoneOp);
 REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);
 
 TEST(Backward, simple_op_not_need_grad) {
-  auto fwd = f::OpRegistry::CreateOp(
-      "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
+  auto fwd =
+      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+                              {{"Out", {"out"}}}, f::AttributeMap{});
   ASSERT_NE(fwd, nullptr);
   auto gop = f::Backward(*fwd, {"x"});
   ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
@@ -296,9 +297,10 @@ TEST(Backward, net_fc_backward_normal) {
                               {{"mul_result", {"mul_res"}},
                                {"add_result", {"add_re"}},
                                {"Out", {"out"}}},
-                              {});
+                              f::AttributeMap{});
   ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  std::shared_ptr<f::OperatorBase> gop =
+      f::Backward(*fwd, std::unordered_set<std::string>{});
   ASSERT_TRUE(gop->IsNetOp());
   auto net = static_cast<ops::NetOp *>(gop.get());
 
@@ -322,9 +324,10 @@ TEST(Backward, net_fc_backward_not_have_b) {
                               {{"mul_result", {"mul_res"}},
                                {"add_result", {"add_res"}},
                                {"Out", {"tmp"}}},
-                              {});
+                              f::AttributeMap{});
   ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  std::shared_ptr<f::OperatorBase> gop =
+      f::Backward(*fwd, std::unordered_set<std::string>{});
   ASSERT_TRUE(gop->IsNetOp());
   auto net = static_cast<ops::NetOp *>(gop.get());
 
@@ -346,13 +349,13 @@ TEST(Backward, net_input_of_network_not_need_grad) {
       {{"mul_result", {"mul_tmp_0"}},
        {"add_result", {"add_tmp_0"}},
        {"Out", {"hidden0"}}},
-      {}));
+      f::AttributeMap{}));
   net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
       {{"mul_result", {"mul_tmp_1"}},
        {"add_result", {"add_tmp_1"}},
        {"Out", {"hidden1"}}},
-      {}));
+      f::AttributeMap{}));
   net.CompleteAddOp();
   auto bwd = Backward(net, {"x"});  // x@GRAD is not need.
   ASSERT_TRUE(bwd->IsNetOp());
@@ -381,12 +384,13 @@ TEST(Backward, net_input_of_network_not_need_grad) {
 TEST(Backward, net_shared_weight) {
   ops::NetOp net;
   net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
-                                       {{"Out", {"out"}}}, {}));
+                                       {{"Out", {"out"}}}, f::AttributeMap{}));
   net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
-                                       {{"Out", {"FinalOut"}}}, {}));
+                                       {{"Out", {"FinalOut"}}},
+                                       f::AttributeMap{}));
   net.CompleteAddOp();
 
-  auto bwd = f::Backward(net, {});
+  auto bwd = f::Backward(net, std::unordered_set<std::string>{});
   ASSERT_TRUE(bwd->IsNetOp());
   auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
   ASSERT_EQ(3UL, bwd_net->ops_.size());
@@ -394,8 +398,9 @@ TEST(Backward, net_shared_weight) {
 }
 
 TEST(Backward, op_all_input_are_not_need) {
-  auto fwd = f::OpRegistry::CreateOp(
-      "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
+  auto fwd =
+      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+                              {{"Out", {"out"}}}, f::AttributeMap{});
   auto backward = f::Backward(*fwd, {"x", "b"});
   ASSERT_TRUE(backward->IsNetOp());
   auto net = static_cast<ops::NetOp *>(backward.get());
@@ -403,8 +408,9 @@ TEST(Backward, op_all_input_are_not_need) {
 }
 
 TEST(Backward, op_all_output_are_not_need) {
-  auto fwd = f::OpRegistry::CreateOp(
-      "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
+  auto fwd =
+      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+                              {{"Out", {"out"}}}, f::AttributeMap{});
   auto backward = f::Backward(*fwd, {"out"});
   ASSERT_TRUE(backward->IsNetOp());
   auto net = static_cast<ops::NetOp *>(backward.get());
@@ -412,8 +418,9 @@ TEST(Backward, op_all_output_are_not_need) {
 }
 
 TEST(Backward, op_part_of_output_are_not_need) {
-  auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
-                                     {{"y", {"Y"}}, {"z", {"Z"}}}, {});
+  auto fwd =
+      f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
+                              {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{});
   auto backward = f::Backward(*fwd, {"Z"});
   ASSERT_TRUE(backward->IsNetOp());
   auto net = static_cast<ops::NetOp *>(backward.get());
@@ -437,7 +444,7 @@ TEST(Backward, op_part_of_output_are_not_need) {
 
 TEST(Backward, op_part_of_input_are_not_need) {
   auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
-                                     {{"Out", {"out"}}}, {});
+                                     {{"Out", {"out"}}}, f::AttributeMap{});
   auto backward = f::Backward(*fwd, {"a"});
   auto &grad_mul = *backward;
   ASSERT_EQ(grad_mul.Type(), "mul_grad");
@@ -458,19 +465,19 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
       {{"mul_result", {"mul_out1"}},
        {"add_result", {"add_out1"}},
        {"Out", {"out1"}}},
-      {}));
+      f::AttributeMap{}));
   net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
       {{"mul_result", {"mul_out2"}},
        {"add_result", {"tmp_out2"}},
        {"Out", {"out2"}}},
-      {}));
+      f::AttributeMap{}));
   net.AppendOp(f::OpRegistry::CreateOp(
       "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
       {{"mul_result", {"mul_out3"}},
        {"add_result", {"tmp_out3"}},
        {"Out", {"out3"}}},
-      {}));
+      f::AttributeMap{}));
   net.CompleteAddOp();
 
   auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
@@ -509,7 +516,8 @@ TEST(Backward, simple_single_op) {
 
   auto target = f::VarDescBind("out");
   target.SetShape({1});
-  auto var_to_grad = AppendBackward(program, target, {});
+  auto var_to_grad =
+      AppendBackward(program, target, std::unordered_set<std::string>{});
 
   ASSERT_EQ(block->AllOps().size(), 3UL);
   f::OpDescBind *fill_op = block->AllOps()[1];
@@ -546,7 +554,7 @@ TEST(Backward, default_attribute) {
 
   auto target = f::VarDescBind("out");
   target.SetShape({1});
-  AppendBackward(program, target, {});
+  AppendBackward(program, target, std::unordered_set<std::string>{});
 
   ASSERT_EQ(block->AllOps().size(), 3UL);
   EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
@@ -585,7 +593,8 @@ TEST(Backward, simple_mult_op) {
   auto target = f::VarDescBind("out3");
   target.SetShape({1});
   size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {});
+  auto var_to_grad =
+      AppendBackward(program, target, std::unordered_set<std::string>{});
 
   ASSERT_EQ(block->AllOps().size(), 6UL + 1);
   f::OpDescBind *fill_op = block->AllOps()[forward_len];
@@ -817,7 +826,8 @@ TEST(Backward, shared_var) {
   auto target = f::VarDescBind("out3");
   target.SetShape({1});
   size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {});
+  auto var_to_grad =
+      AppendBackward(program, target, std::unordered_set<std::string>{});
 
   ASSERT_EQ(block->AllOps().size(), 8UL);
   f::OpDescBind *fill_op = block->AllOps()[forward_len];
diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc
index 756232b1b5..bd5ea09d7d 100644
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -1,3 +1,16 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #include <sstream>
 #include <vector>
 
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 83aa927c29..a8b8a6f8e8 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -33,32 +33,12 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
 
-Executor::Executor(const std::vector<platform::Place>& places) : own_(true) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  device_contexts_.resize(places.size());
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i] = new platform::CPUDeviceContext(
-          boost::get<platform::CPUPlace>(places[i]));
-    } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-      device_contexts_[i] = new platform::CUDADeviceContext(
-          boost::get<platform::GPUPlace>(places[i]));
-#else
-      PADDLE_THROW(
-          "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
-#endif
-    }
-  }
-}
+DeviceContextPool* DeviceContextPool::pool = nullptr;
 
-Executor::~Executor() {
-  if (own_) {
-    for (auto& device_context : device_contexts_) {
-      delete device_context;
-    }
-  }
+Executor::Executor(const std::vector<platform::Place>& places) {
+  DeviceContextPool& pool = DeviceContextPool::Get();
+  auto borrowed_contexts = pool.Borrow(places);
+  device_contexts_.swap(borrowed_contexts);
 }
 
 static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
@@ -132,8 +112,5 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
   }
 }
 
-Executor::Executor(const platform::DeviceContext& device)
-    : device_contexts_({&device}), own_(false) {}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index b745f4f647..073e04729b 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -14,19 +14,98 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
+#include <unordered_map>
+
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
 
+class DeviceContextPool {
+ public:
+  static DeviceContextPool& Get() {
+    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    return *pool;
+  }
+
+  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+    if (pool == nullptr) {
+      pool = new DeviceContextPool(places);
+    }
+    return *pool;
+  }
+
+  std::vector<const platform::DeviceContext*> Borrow(
+      const std::vector<platform::Place>& places) {
+    PADDLE_ENFORCE_GT(places.size(), 0);
+    PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
+    std::vector<const platform::DeviceContext*> borrowed_contexts;
+    for (auto& place : places) {
+      auto range = device_contexts_.equal_range(place);
+      if (range.first == range.second) {
+        PADDLE_THROW(
+            "'Place' is not supported, Please re-compile with WITH_GPU "
+            "option");
+      }
+      // TODO(dzhwinter) : assign the first found device. Will enhanced later.
+      // device load balancer maybe useful here.
+      borrowed_contexts.emplace_back(range.first->second);
+    }
+    return borrowed_contexts;
+  }
+
+  explicit DeviceContextPool(const std::vector<platform::Place>& places) {
+    PADDLE_ENFORCE_GT(places.size(), 0);
+    for (size_t i = 0; i < places.size(); i++) {
+      if (platform::is_cpu_place(places[i])) {
+        device_contexts_.emplace(
+            places[i], new platform::CPUDeviceContext(
+                           boost::get<platform::CPUPlace>(places[i])));
+      } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+        device_contexts_.emplace(
+            places[i], new platform::CUDADeviceContext(
+                           boost::get<platform::GPUPlace>(places[i])));
+#else
+        PADDLE_THROW(
+            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
+            "option");
+#endif
+      }
+    }
+  }
+
+  ~DeviceContextPool() {}
+
+ private:
+  static DeviceContextPool* pool;
+  struct Hash {
+    std::hash<int> hash_;
+    size_t operator()(const platform::Place& place) const {
+      return hash_(place.which());
+    }
+  };
+  std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
+                          Hash>
+      device_contexts_;
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
+
 class Executor {
  public:
+  // TODO(dzhwinter) : Do not rely on this function, it will be removed
+  explicit Executor(const platform::DeviceContext& device)
+      : Executor(std::vector<platform::Place>({device.GetPlace()})) {}
+
+  explicit Executor(const platform::Place& place)
+      : Executor(std::vector<platform::Place>({place})) {}
+
   explicit Executor(const std::vector<platform::Place>& places);
-  explicit Executor(const platform::DeviceContext& devices);
-  ~Executor();
 
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
@@ -39,7 +118,6 @@ class Executor {
 
  private:
   std::vector<const platform::DeviceContext*> device_contexts_;
-  bool own_;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
new file mode 100644
index 0000000000..1c4476f4b3
--- /dev/null
+++ b/paddle/framework/init.cc
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <algorithm>
+#include <string>
+
+#include "paddle/framework/executor.h"
+#include "paddle/framework/init.h"
+#include "paddle/platform/place.h"
+#include "paddle/string/piece.h"
+
+namespace paddle {
+namespace framework {
+
+std::once_flag gflags_init_flag;
+
+// TODO(qijun) move init gflags to init.cc
+void InitGflags(std::vector<std::string> &argv) {
+  std::call_once(gflags_init_flag, [&]() {
+    int argc = argv.size();
+    char **arr = new char *[argv.size()];
+    std::string line;
+    for (size_t i = 0; i < argv.size(); i++) {
+      arr[i] = &argv[i][0];
+      line += argv[i];
+      line += ' ';
+    }
+    google::ParseCommandLineFlags(&argc, &arr, true);
+    VLOG(1) << "Init commandline: " << line;
+  });
+}
+
+bool InitDevices(const std::vector<std::string> &devices) {
+  // device format
+  // CPU
+  // GPU:1
+  // TODO(dzhwinter) : add device format annotation for users.
+  std::vector<platform::Place> places;
+  for (auto &device : devices) {
+    auto p = string::Piece(device);
+    if (string::Find(p, ':', 0) == string::Piece::npos) {
+      places.emplace_back(platform::CPUPlace());
+    } else if (string::HasPrefix(p, "GPU")) {
+#ifdef PADDLE_WITH_CUDA
+      auto pos = string::RFind(p, ':', string::Piece::npos);
+      auto number = device.substr(pos + 1);
+      places.emplace_back(platform::GPUPlace(std::stoi(number)));
+#else
+      LOG(WARNING)
+          << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+#endif
+    } else {
+      return false;
+    }
+  }
+
+  if (std::find_if(places.begin(), places.end(),
+                   [&](const platform::Place &place) {
+                     return platform::is_cpu_place(place);
+                   }) == places.end()) {
+    places.emplace_back(platform::CPUPlace());
+    LOG(WARNING) << "Not specified any device, use CPU by Default.";
+  }
+  DeviceContextPool::Create(places);
+  return true;
+  return true;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/init.h b/paddle/framework/init.h
new file mode 100644
index 0000000000..1715cd81e6
--- /dev/null
+++ b/paddle/framework/init.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <mutex>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+namespace paddle {
+namespace framework {
+
+void InitGflags(std::vector<std::string> &argv);
+
+bool InitDevices(const std::vector<std::string> &devices);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
new file mode 100644
index 0000000000..f65e881a76
--- /dev/null
+++ b/paddle/framework/init_test.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+
+TEST(Init, InitDevices) {
+  using paddle::framework::InitDevices;
+  std::vector<std::string> ds1 = {"CPU"};
+  ASSERT_EQ(InitDevices(ds1), true);
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds2), true);
+#endif
+}
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 2281d93df9..7ba1e3e4e3 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != VarDesc::LOD_TENSOR) {
-      VLOG(3) << "input " << in << "is not LodTensor";
+      VLOG(3) << "input " << in << " is not LodTensor";
       return;
     }
     PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
@@ -316,8 +316,8 @@ static void InitInferShapeFuncs() {
     for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) {
       auto op_type = kern_pair.first;
       auto &op_info = info_map.at(op_type);
-      auto op =
-          static_cast<OperatorWithKernel *>(op_info.Creator()("", {}, {}, {}));
+      auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
+          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
       if (op_info.infer_shape_) {  // infer_shape has been registered.
         continue;
       }
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index daade439e5..b29238432b 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar {
     return 0;                                                             \
   }
 
-#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
 
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar {
 #else
 #define USE_OP_KERNEL(op_type)        \
   USE_OP_DEVICE_KERNEL(op_type, CPU); \
-  USE_OP_DEVICE_KERNEL(op_type, GPU)
+  USE_OP_DEVICE_KERNEL(op_type, CUDA)
 #endif
 
 #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
@@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar {
   USE_OP_ITSELF(op_type);        \
   USE_OP_DEVICE_KERNEL(op_type, CPU);
 
-#define USE_GPU_ONLY_OP(op_type) \
-  USE_OP_ITSELF(op_type);        \
-  USE_OP_DEVICE_KERNEL(op_type, GPU)
+#define USE_CUDA_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);         \
+  USE_OP_DEVICE_KERNEL(op_type, CUDA)
 
 #define USE_OP(op_type)   \
   USE_OP_ITSELF(op_type); \
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index f1444eeee9..e83d754783 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,20 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-template <>
-Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
-    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_.GetEigenDevice<platform::CPUPlace>();
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <>
-Eigen::GpuDevice&
-ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_.GetEigenDevice<platform::GPUPlace>();
-}
-#endif
-
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope,
 }
 OpKernelType OperatorWithKernel::GetKernelType(
     const ExecutionContext& ctx) const {
-  return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
 }
 DataType OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 60861d9293..e60dbfc313 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -276,17 +276,25 @@ class ExecutionContext {
     out_tensor->set_lod(in_tensor.lod());
   }
 
-  template <typename PlaceType,
-            typename DeviceType = typename platform::EigenDeviceConverter<
-                PlaceType>::EigenDeviceType>
-  DeviceType& GetEigenDevice() const;
-
   platform::Place GetPlace() const { return device_context_.GetPlace(); }
 
+  template <typename DeviceContextType>
+  const DeviceContextType& device_context() const {
+    return *reinterpret_cast<const DeviceContextType*>(&device_context_);
+  }
+
   const platform::DeviceContext& device_context() const {
     return device_context_;
   }
 
+#ifdef PADDLE_WITH_CUDA
+  const inline platform::CUDADeviceContext& cuda_device_context() const {
+    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+    return *reinterpret_cast<const platform::CUDADeviceContext*>(
+        &device_context_);
+  }
+#endif
+
   //! Get actual name vector for this input.
   const std::vector<std::string>& Inputs(const std::string& name) const {
     return op_.Inputs(name);
@@ -297,14 +305,6 @@ class ExecutionContext {
     return op_.Outputs(name);
   }
 
-#ifdef PADDLE_WITH_CUDA
-  const inline platform::CUDADeviceContext& cuda_device_context() const {
-    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
-    return *reinterpret_cast<const platform::CUDADeviceContext*>(
-        &device_context_);
-  }
-#endif
-
  private:
   const OperatorBase& op_;
   const Scope& scope_;
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 1e19f82b34..b678178454 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {}
   OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
-    return OpKernelType(DataType::FP32, ctx.device_context());
+    return OpKernelType(DataType::FP32, ctx.GetPlace());
   }
 };
 
@@ -261,7 +261,9 @@ class OperatorClone : public paddle::framework::OperatorBase {
 };
 
 TEST(Operator, Clone) {
-  OperatorClone a("ABC", {}, {}, {});
+  OperatorClone a("ABC", paddle::framework::VariableNameMap{},
+                  paddle::framework::VariableNameMap{},
+                  paddle::framework::AttributeMap{});
   auto b = a.Clone();
   ASSERT_EQ(a.Type(), b->Type());
 }
diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc
index 5988874809..f21df37a29 100644
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
@@ -54,7 +54,8 @@ TEST(Prune, one_operator) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.MutableBlock(0);
 
-  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block);
+  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
+        block);
 
   f::ProgramDesc *pdesc = program.Proto();
   f::ProgramDesc pruned;
@@ -71,10 +72,14 @@ TEST(Prune, forward) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.MutableBlock(0);
 
-  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block);
-  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, {}, block);
-  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, {}, block);
-  AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, {}, block);
+  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{},
+        block);
 
   f::ProgramDesc *pdesc = program.Proto();
 
@@ -90,11 +95,14 @@ TEST(Prune, multi_input_op) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.MutableBlock(0);
 
-  AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, {}, block);
-  AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, {}, block);
-  AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, {}, block);
-  AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, {},
+  AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
         block);
+  AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{},
+        block);
+  AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}},
+        f::AttributeMap{}, block);
 
   f::ProgramDesc *pdesc = program.Proto();
   pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);
@@ -108,9 +116,12 @@ TEST(Prune, multi_output_op) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.MutableBlock(0);
 
-  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block);
-  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block);
-  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block);
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+        f::AttributeMap{}, block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
+        block);
 
   f::ProgramDesc *pdesc = program.Proto();
   pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
@@ -124,9 +135,12 @@ TEST(Prune, multi_target) {
   f::ProgramDescBind program;
   f::BlockDescBind *block = program.MutableBlock(0);
 
-  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block);
-  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block);
-  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block);
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+        f::AttributeMap{}, block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
+        block);
 
   f::ProgramDesc *pdesc = program.Proto();
   pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 8d34eee886..de7b70e271 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,6 +126,11 @@ public:
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
     }
+#ifdef PADDLE_MOBILE_INFERENCE
+    if (Device == DEVICE_TYPE_CPU) {
+      memory_.reset();
+    }
+#endif
   }
 };
 
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f5a41b66bf..57c890e488 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Logging.h"
 
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
 #include "MKLDNNActivation.h"
 #endif
 
@@ -490,7 +490,7 @@ Error __must_check backward(Argument& act) {
 END_DEFINE_ACTIVATION(log)
 
 ActivationFunction* ActivationFunction::create(const std::string& type) {
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
   if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
     return MKLDNNActivation::create(type);
   }
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index be112b4123..68bf37d59d 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
 #include "paddle/gserver/layers/MKLDNNLayer.h"
 #endif
 
@@ -307,7 +307,7 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
 }
 
 void NeuralNetwork::finish() {
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
   FOR_EACH_R(layer, layers_) {
     MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
     if (dnnLayer) {
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp
index 48132a3ce4..e7f081c023 100644
--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
@@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() {
   if (outputH_ == 0) outputH_ = configOutH_;
   if (outputW_ == 0) outputW_ = configOutW_;
   imageH_ = imageSize(outputH_,
-                      filterH_,
+                      (filterH_ - 1) * dilationH_ + 1,
                       paddingH_,
                       strideH_,
                       /* caffeMode */ true);
 
   imageW_ = imageSize(outputW_,
-                      filterW_,
+                      (filterW_ - 1) * dilationW_ + 1,
                       paddingW_,
                       strideW_,
                       /* caffeMode */ true);
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
new file mode 100644
index 0000000000..741984bb68
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -0,0 +1,163 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLRNLayer.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
+
+bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  /* the size of inputs for norm-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1UL);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  localSize_ = conf.size();
+  alpha_ = conf.scale();
+  beta_ = conf.pow();
+
+  ic_ = conf.channels();
+  oc_ = ic_;
+  iw_ = conf.img_size();
+  ow_ = conf.output_x();
+  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  CHECK_EQ(iw_, ow_);
+  CHECK_EQ(ih_, oh_);
+  return true;
+}
+
+void MKLDNNLRNLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc can not be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel can not be changed";
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], out);
+
+  resetFwdPD(fwdPD_, inputs[0], out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  std::shared_ptr<lrn_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(inputs[0], out);
+
+  resetBwdPD(pd, inputs[0], out);
+
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  CHECK(in);
+  resetOutValue(out, in->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr in,
+                                MKLDNNMatrixPtr out) {
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = lrn_fwd::desc(pk,
+                               algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
+  // prepare workspace if necessary
+  workspace_ =
+      passType_ != PASS_TEST
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+
+void MKLDNNLRNLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  fwd_ = workspace_
+             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
+             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr& in,
+                                MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(out);
+  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               out->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNLRNLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+  CHECK(inVals_[0]);
+  CHECK(workspace_);
+  bwdData_ = std::make_shared<lrn_bwd>(
+      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h
new file mode 100644
index 0000000000..cfe5621252
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::lrn_forward lrn_fwd;
+typedef mkldnn::lrn_backward lrn_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
+ *
+ * The config file api is mkldnn_lrn
+ */
+class MKLDNNLRNLayer : public MKLDNNLayer {
+protected:
+  // save forward primitive_desc, which can be used in backward
+  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_lrn_backward.cpp, lrn need workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+  int localSize_;
+  float alpha_, beta_;  // scale and pow in paddle
+
+public:
+  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNLRNLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 87613a96c5..fceb389d06 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -45,6 +45,8 @@ bool PoolLayer::init(const LayerMap& layerMap,
   strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
   return true;
 }
 
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index d43292ad2d..9df672a935 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -38,6 +38,8 @@ protected:
 
   std::string poolType_;
 
+  bool excludeMode_;
+
 public:
   explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
 
diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp
index d90b438448..6a9de394ce 100644
--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
@@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config,
   strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
 }
 
 size_t PoolProjection::getSize() {
@@ -141,7 +143,8 @@ void AvgPoolProjection::forward() {
                        outputY_,
                        outputX_,
                        confPaddingY_,
-                       confPadding_);
+                       confPadding_,
+                       excludeMode_);
 }
 
 void AvgPoolProjection::backward(const UpdateCallback& callback) {
@@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) {
                              1,
                              1,
                              confPaddingY_,
-                             confPadding_);
+                             confPadding_,
+                             excludeMode_);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index 9a75f465f6..a0412714bc 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -28,6 +28,7 @@ protected:
   int confPaddingY_, confPadding_;
   size_t channels_;
   std::string poolType_;
+  bool excludeMode_;
 
 public:
   PoolProjection(const ProjectionConfig& config,
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
index 2c8256b91c..7d7c30b4d8 100644
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -84,12 +84,15 @@ void ROIPoolLayer::forward(PassType passType) {
   size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
 
   real* outputData = outputValue->getData();
-  Matrix::resizeOrCreate(maxIdxs_,
-                         numROIs,
-                         channels_ * pooledHeight_ * pooledWidth_,
-                         false,
-                         false);
-  real* argmaxData = maxIdxs_->getData();
+  real* argmaxData = nullptr;
+  if (passType != PASS_TEST) {
+    Matrix::resizeOrCreate(maxIdxs_,
+                           numROIs,
+                           channels_ * pooledHeight_ * pooledWidth_,
+                           false,
+                           false);
+    argmaxData = maxIdxs_->getData();
+  }
 
   for (size_t n = 0; n < numROIs; ++n) {
     // the first five elememts of each RoI should be:
@@ -128,14 +131,18 @@ void ROIPoolLayer::forward(PassType passType) {
           bool isEmpty = (hend <= hstart) || (wend <= wstart);
           size_t poolIndex = ph * pooledWidth_ + pw;
           outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
-          argmaxData[poolIndex] = -1;
+          if (argmaxData) {
+            argmaxData[poolIndex] = -1;
+          }
 
           for (size_t h = hstart; h < hend; ++h) {
             for (size_t w = wstart; w < wend; ++w) {
               size_t index = h * width_ + w;
               if (batchData[index] > outputData[poolIndex]) {
                 outputData[poolIndex] = batchData[index];
-                argmaxData[poolIndex] = index;
+                if (argmaxData) {
+                  argmaxData[poolIndex] = index;
+                }
               }
             }
           }
@@ -143,7 +150,9 @@ void ROIPoolLayer::forward(PassType passType) {
       }
       batchData += channelOffset;
       outputData += poolChannelOffset;
-      argmaxData += poolChannelOffset;
+      if (argmaxData) {
+        argmaxData += poolChannelOffset;
+      }
     }
     bottomROIs += roiOffset;
   }
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp
index 5fa7b6f488..6b769378d2 100644
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
     hl_sequence2batch_copy(
         batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
   } else {
-    for (int i = 0; i < batchCount; ++i) {
-      if (seq2batch) {
+    if (seq2batch) {
+#ifdef PADDLE_USE_MKLML
+      const int blockMemSize = 8 * 1024;
+      const int blockSize = blockMemSize / sizeof(real);
+#pragma omp parallel for collapse(2)
+      for (int i = 0; i < batchCount; ++i) {
+        for (int j = 0; j < seqWidth; j += blockSize) {
+          memcpy(batch.rowBuf(i) + j,
+                 sequence.rowBuf(idxData[i]) + j,
+                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
+                                            : blockMemSize);
+        }
+      }
+#else
+      for (int i = 0; i < batchCount; ++i) {
         memcpy(batch.rowBuf(i),
                sequence.rowBuf(idxData[i]),
                seqWidth * sizeof(real));
-      } else {
+      }
+#endif
+    } else {
+#ifdef PADDLE_USE_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < batchCount; ++i) {
         memcpy(sequence.rowBuf(idxData[i]),
                batch.rowBuf(i),
                seqWidth * sizeof(real));
diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/gserver/tests/mkldnn_simple_net.conf
index 8bbe91e56d..0e9d6b31fa 100644
--- a/paddle/gserver/tests/mkldnn_simple_net.conf
+++ b/paddle/gserver/tests/mkldnn_simple_net.conf
@@ -51,6 +51,8 @@ tmp = img_pool_layer(input=tmp,
             padding=1,
             pool_type=MaxPooling())
 
+tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75)
+
 tmp = fc_layer(input=tmp,
             size=channels,
             bias_attr=False,
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index c5359f272b..a2f07937b8 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -238,9 +238,24 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                             /* caffeMode */ true);
   conv->set_output_x(output_x);
   conv->set_output_y(output_y);
+  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
+            << "; output_y: " << output_y;
   if (isDeconv) {
+    int deconv_image_x = imageSize(output_x,
+                                   (conv->filter_size() - 1) * DILATION + 1,
+                                   conv->padding(),
+                                   conv->stride(),
+                                   /* caffeMode */ true);
+    int deconv_image_y = imageSize(output_y,
+                                   (conv->filter_size_y() - 1) * DILATION + 1,
+                                   conv->padding_y(),
+                                   conv->stride_y(),
+                                   /* caffeMode */ true);
+
+    LOG(INFO) << " deconv_image_x: " << deconv_image_x
+              << "; deconv_image_y: " << deconv_image_y;
     conf.set_input_size(output_x * output_y * CHANNELS);
-    conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS);
+    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
   } else {
     conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
     conf.set_output_size(output_x * output_y * NUM_FILTERS);
@@ -1211,7 +1226,10 @@ void setPoolConfig(TestConfig* config,
   pool->set_output_y(oh);
 }
 
-void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
+void testPoolLayer(const string& poolType,
+                   bool trans,
+                   bool useGpu,
+                   bool excludeMode = true) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
   LayerInputConfig* input = config.layerConfig.add_inputs();
@@ -1219,6 +1237,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
 
   pool->set_img_size(14);
   pool->set_img_size_y(14);
+  pool->set_exclude_mode(excludeMode);
   setPoolConfig(&config, pool, poolType);
   config.layerConfig.set_size(pool->output_x() * pool->output_y() *
                               pool->channels());
@@ -1250,16 +1269,26 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
 
 TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ false,
+                /* excludeMode= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
   testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
 
 #ifdef PADDLE_WITH_CUDA
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ true,
+                /* excludeMode= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2(
+      "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 56b523f220..ad1dbc3ee2 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -272,6 +272,51 @@ TEST(MKLDNNLayer, BatchNormLayer) {
   testBatchNormLayer({4, 16, 8, 10});
 }
 
+struct testLRNDesc {
+  int bs, ic, ih, iw;
+  float scale, pow;
+  int localSize;
+};
+
+void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_lrn");
+  cfg.layerConfig.set_active_type("relu");
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_channels(pm.ic);
+  norm->set_size(pm.localSize);
+  norm->set_scale(pm.scale);
+  norm->set_pow(pm.pow);
+  norm->set_blocked(0);
+  norm->set_img_size(pm.iw);
+  norm->set_img_size_y(pm.ih);
+  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
+  cfg.layerConfig.set_size(layerSize);
+  cfg.biasSize = 0;
+}
+
+void testLRNLayer(const testLRNDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNLRNConfig(dnnConfig, pm);
+  // mkldnn_lrn <==> norm with cmrnorm-projection type
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("norm");
+  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cmrnorm-projection");
+  norm->set_scale(norm->scale() / norm->size());
+  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
+}
+
+TEST(MKLDNNLayer, LRNLayer) {
+  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
+  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
+  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
+}
+
 struct testImageDesc {
   int bs, ic, ih, iw;
 };
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 94ef561f06..17563bf5e1 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,7 @@ public:
    */
   virtual void* alloc(size_t size) {
     void* ptr;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
     // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
     // memory alignment
     CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index ba86eacbb5..28ab54b450 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
 }
 #endif
 
-#if defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_WITH_MKLML)
 
 template <>
 void vExp<float>(const int n, const float* a, float* r) {
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index f6e77029bd..29fe36e3a4 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_
 
-#ifdef PADDLE_USE_MKLML
+#ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
 #include <mkl_vml_functions.h>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 88e9180690..1ec4336cab 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
 
+#include "NEONFunctions.h"
 #include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"
 
@@ -1130,7 +1131,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
@@ -1153,7 +1155,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                      paddingH,
                      paddingW,
                      data_,
-                     getStride());
+                     getStride(),
+                     excludeMode);
 }
 
 void GpuMatrix::avgPoolBackward(Matrix& outGrad,
@@ -1168,7 +1171,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                                 real scaleTargets,
                                 real scaleOutput,
                                 size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
   CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
 
   real* outDiff = outGrad.getData();
@@ -1194,7 +1198,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                       scaleTargets,
                       scaleOutput,
                       data_,
-                      outGrad.getStride());
+                      outGrad.getStride(),
+                      excludeMode);
 }
 
 void GpuMatrix::maxPool3DForward(Matrix& inputMat,
@@ -2136,7 +2141,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   // The main loop
   size_t num = input.getHeight();
   size_t inLength = imgSizeH * imgSizeW;
@@ -2165,7 +2171,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
               tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
             }
           }
-          int poolSize = (hend - hstart) * (wend - wstart);
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
           CHECK(poolSize);
           tgtData[ph * outputW + pw] /= poolSize;
         }
@@ -2189,7 +2196,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
                                 real scaleTargets,
                                 real scaleOutput,
                                 size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
   size_t num = input.getHeight();
   size_t channels = input.getWidth() / outputH / outputW;
   size_t inLength = imgSizeH * imgSizeW;
@@ -2211,7 +2219,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
           int wstart = pw * strideW - paddingW;
           int wend = std::min(wstart + sizeX, imgSizeW);
           wstart = std::max(wstart, 0);
-          int poolSize = (hend - hstart) * (wend - wstart);
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
           CHECK(poolSize);
 
           for (int h = hstart; h < hend; ++h) {
@@ -4157,16 +4166,36 @@ void CpuMatrix::print(std::ostream& os) const {
 void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* input = data.getData();
   real* w = W.getData();
+  real* output = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
   size_t paraSize = W.getHeight() * W.getWidth();
   CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+
   size_t partial_sum = numElements / paraSize;
+  if (paraSize == numElements) {
+    for (size_t n = 0; n < numSamples * numElements; ++n) {
+      output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
+    }
+    return;
+  }
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  for (size_t n = 0; n < numSamples; ++n) {
+    for (size_t i = 0; i < paraSize; i++) {
+      neon::prelu(
+          input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
+    }
+    input = input + numElements;
+    output = output + numElements;
+  }
+#else
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
-      data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
+      output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
     }
   }
+#endif
 }
 
 void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index e273f11236..c8e690e642 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -911,7 +911,8 @@ public:
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              bool excludeMode = true) {
     LOG(FATAL) << "Not implemeted";
   }
 
@@ -927,9 +928,11 @@ public:
                                real scaleTargets,
                                real scaleOutput,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode = true) {
     LOG(FATAL) << "Not implemeted";
   }
+
   /**
    * Pooling 3D forward operation, pick out the largest element
    * in the sizeX of value
@@ -1458,7 +1461,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      bool excludeMode = true);
 
   void avgPoolBackward(Matrix& input,
                        size_t imgSizeH,
@@ -1472,7 +1476,8 @@ public:
                        real scaleTargets,
                        real scaleOutput,
                        size_t paddingH,
-                       size_t paddingW);
+                       size_t paddingW,
+                       bool excludeMode = true);
 
   void maxPool3DForward(Matrix& inputMat,
                         Matrix& maxPoolIdx,
@@ -1730,7 +1735,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      bool excludeMode = true);
 
   void avgPoolBackward(Matrix& input,
                        size_t imgSizeH,
@@ -1744,7 +1750,8 @@ public:
                        real scaleTargets,
                        real scaleOutput,
                        size_t paddingH,
-                       size_t paddingW);
+                       size_t paddingW,
+                       bool excludeMode = true);
 
   void maxPool3DForward(Matrix& inputMat,
                         Matrix& maxPoolIdx,
diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp
index 3bf47901f1..0f83149422 100644
--- a/paddle/math/NEONFunctions.cpp
+++ b/paddle/math/NEONFunctions.cpp
@@ -49,6 +49,46 @@ void relu(const float* a, float* b, int len) {
   }
 }
 
+// b[i] = a[i] > 0.0f ? a[i] : a[i] * w
+void prelu(const float* a, float w, float* b, int len) {
+  int offset = len % 16;
+  float32x4_t ma0, ma1, ma2, ma3;
+
+  float32x4_t zero = vdupq_n_f32(0.f);
+  float32x4_t vw = vdupq_n_f32(w);
+
+  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
+    ma0 = vld1q_f32(a);
+    ma1 = vld1q_f32(a + 4);
+    ma2 = vld1q_f32(a + 8);
+    ma3 = vld1q_f32(a + 12);
+
+    uint32x4_t flag0 = vcgtq_f32(ma0, zero);
+    uint32x4_t flag1 = vcgtq_f32(ma1, zero);
+    uint32x4_t flag2 = vcgtq_f32(ma2, zero);
+    uint32x4_t flag3 = vcgtq_f32(ma3, zero);
+
+    float32x4_t mul0 = vmulq_f32(ma0, vw);
+    float32x4_t mul1 = vmulq_f32(ma1, vw);
+    float32x4_t mul2 = vmulq_f32(ma2, vw);
+    float32x4_t mul3 = vmulq_f32(ma3, vw);
+
+    ma0 = vbslq_f32(flag0, ma0, mul0);
+    ma1 = vbslq_f32(flag1, ma1, mul1);
+    ma2 = vbslq_f32(flag2, ma2, mul2);
+    ma3 = vbslq_f32(flag3, ma3, mul3);
+
+    vst1q_f32(b, ma0);
+    vst1q_f32(b + 4, ma1);
+    vst1q_f32(b + 8, ma2);
+    vst1q_f32(b + 12, ma3);
+  }
+
+  for (int i = 0; i < offset; i++) {
+    b[i] = a[i] > 0.0f ? a[i] : a[i] * w;
+  }
+}
+
 }  // namespace neon
 }  // namespace paddle
 
diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h
index 69085e3335..d67b2f47a8 100644
--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
@@ -18,6 +18,7 @@ namespace paddle {
 namespace neon {
 
 void relu(const float* a, float* b, int len);
+void prelu(const float* a, float w, float* b, int len);
 
 }  // namespace neon
 }  // namespace paddle
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
new file mode 100644
index 0000000000..efebbce504
--- /dev/null
+++ b/paddle/math/float16.h
@@ -0,0 +1,739 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#endif  // PADDLE_WITH_CUDA
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#include "paddle/platform/hostdevice.h"
+
+#ifdef __GNUC__
+#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
+#else
+#define PADDLE_GNUC_VER 0
+#endif  // __GNUC__
+
+#ifdef __clang__
+#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
+#else
+#define PADDLE_CLANG_VER 0
+#endif  // __clang__
+
+#if defined(__CUDACC__) && CUDA_VERSION >= 7050
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+#define PADDLE_ARM
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#define PADDLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
+    (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
+#define PADDLE_WITH_NATIVE_FP16
+#endif
+
+#ifndef PADDLE_ARM
+#include <immintrin.h>
+#endif  // PADDLE_ARM
+
+#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+
+namespace paddle {
+
+// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
+// and aligned at least on a 2-byte boundary, which leads to efficient
+// memory access of float16 struct and also makes float16 compatible
+// with CUDA half, ARM float16_t, and Eigen::half data types.
+struct PADDLE_ALIGN(2) float16 {
+public:
+  uint16_t x;
+
+  // Constructors
+  HOSTDEVICE inline float16() : x(0) {}
+
+  HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
+
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit float16(const half& h) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
+#else
+    x = h.x;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  // __fp16 is a native half precision data type for arm cpu,
+  // float16_t is an alias for __fp16
+  HOSTDEVICE inline explicit float16(const float16_t& h) {
+    x = *reinterpret_cast<const uint16_t*>(&h);
+  }
+#endif
+
+  HOSTDEVICE inline explicit float16(float val) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = __float2half(val);
+    x = *reinterpret_cast<uint16_t*>(&tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+    float32x4_t tmp = vld1q_dup_f32(&val);
+    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
+    x = *reinterpret_cast<uint16_t*>(&res);
+
+#elif defined(__F16C__)
+    x = _cvtss_sh(val, 0);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v, s;
+    v.f = val;
+    uint32_t sign = v.si & sigN;
+    v.si ^= sign;
+    sign >>= shiftSign;  // logical shift
+    s.si = mulN;
+    s.si = s.f * v.f;  // correct subnormals
+    v.si ^= (s.si ^ v.si) & -(minN > v.si);
+    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
+    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+    v.ui >>= shift;  // logical shift
+    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
+    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+    x = v.ui | sign;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
+
+  template <class T>
+  HOSTDEVICE inline explicit float16(const T& val)
+      : x(float16(static_cast<float>(val)).x) {}
+
+  HOSTDEVICE inline float16& operator=(const float16& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+// Assignment operators
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline float16& operator=(const half& rhs) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
+#else
+    x = rhs.x;
+#endif
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
+    x = *reinterpret_cast<const uint16_t*>(&rhs);
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(bool b) {
+    x = b ? 0x3c00 : 0;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(float val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(double val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+// Conversion opertors
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit operator half() const {
+#if CUDA_VERSION >= 9000
+    __half_raw h;
+    h.x = x;
+    return half(h);
+#else
+    half h;
+    h.x = x;
+    return h;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit operator Eigen::half() const {
+    Eigen::half h;
+    h.x = x;
+    return h;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline explicit operator float16_t() const {
+    return *reinterpret_cast<const float16_t*>(this);
+  }
+#endif
+
+  HOSTDEVICE inline explicit operator float() const {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = *reinterpret_cast<const half*>(this);
+    return __half2float(tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
+    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
+
+#elif defined(__F16C__)
+    return _cvtsh_ss(this->x);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v;
+    v.ui = this->x;
+    int32_t sign = v.si & sigC;
+    v.si ^= sign;
+    sign <<= shiftSign;
+    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
+    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+    Bits s;
+    s.si = mulC;
+    s.f *= v.si;
+    int32_t mask = -(norC > v.si);
+    v.si <<= shift;
+    v.si ^= (s.si ^ v.si) & mask;
+    v.si |= sign;
+    return v.f;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
+
+  HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+private:
+  union Bits {
+    float f;
+    int32_t si;
+    uint32_t ui;
+  };
+
+  static const int shift = 13;
+  static const int shiftSign = 16;
+
+  static const int32_t infN = 0x7F800000;
+  static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
+  static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
+  static const int32_t sigN = 0x80000000;  // sign bit
+
+  static constexpr int32_t infC = infN >> shift;
+  static constexpr int32_t nanN = (infC + 1)
+                                  << shift;  // minimum flt16 nan as float32
+  static constexpr int32_t maxC = maxN >> shift;
+  static constexpr int32_t minC = minN >> shift;
+  static constexpr int32_t sigC = sigN >> shiftSign;
+
+  static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
+  static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
+  static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
+  static const int32_t norC = 0x00400;     // min flt32 normal downshifted
+
+  static constexpr int32_t maxD = infC - maxC - 1;
+  static constexpr int32_t minD = minC - subC - 1;
+};
+
+// Arithmetic operators on GPU
+// CUDA 9.0 provides built-in arithmetic operators for half while
+// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
+// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
+// CUDA 9.0 regarding the half data type.
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+
+DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half& operator+=(half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+
+DEVICE inline half& operator-=(half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+
+DEVICE inline half& operator*=(half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+
+DEVICE inline half& operator/=(half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+
+DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
+}
+
+#endif  // PADDLE_CUDA_FP16
+
+// Arithmetic operators on ARMv8.2-A CPU
+#if defined(PADDLE_WITH_NATIVE_FP16)
+HOST inline float16 operator+(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fadd h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator-(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fsub h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator*(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fmul h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator/(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fdiv h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator-(const float16& a) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "fneg h0, h0\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0");
+  return res;
+}
+
+HOST inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+HOST inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+HOST inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+HOST inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+HOST inline bool operator==(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmeq h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator!=(const float16& a, const float16& b) {
+  return !(a == b);
+}
+
+HOST inline bool operator<(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator<=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+// Arithmetic operators, software emulated on other CPU
+#else
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+  return float16(float(a) + float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+  return float16(float(a) - float(b));
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+  return float16(float(a) * float(b));
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+  return float16(float(a) / float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = float16(float(a) + float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = float16(float(a) - float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = float16(float(a) * float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = float16(float(a) / float(b));
+  return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+  return float(a) == float(b);
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+  return float(a) != float(b);
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+  return float(a) < float(b);
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+  return float(a) <= float(b);
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+  return float(a) > float(b);
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+  return float(a) >= float(b);
+}
+#endif
+}  // namespace paddle
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index d8b7f9e3fc..dcd2a34583 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -22,6 +22,7 @@ if(WITH_GPU)
     link_paddle_test(test_Tensor)
     CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
     link_paddle_test(test_lazyAssign)
+    nv_test(test_float16_gpu SRCS test_float16.cu)
 else()
     compile_cu_as_cpp(test_Tensor.cu)
     add_unittest(test_Tensor test_Tensor.cu)
@@ -33,3 +34,4 @@ add_simple_unittest(test_FPException)
 add_simple_unittest(test_GpuProfiler)
 add_simple_unittest(test_BaseMatrix)
 add_simple_unittest(test_Matrix)
+add_simple_unittest(test_float16)
diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp
new file mode 100644
index 0000000000..74cc55aa37
--- /dev/null
+++ b/paddle/math/tests/test_float16.cpp
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(float16, conversion_cpu) {
+  // Explicit conversion from Eigen::half
+  EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00);
+  EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800);
+  EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555);
+  EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000);
+  EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000);
+  EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff);
+  EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00);
+
+  // Conversion from float
+  EXPECT_EQ(float16(1.0f).x, 0x3c00);
+  EXPECT_EQ(float16(0.5f).x, 0x3800);
+  EXPECT_EQ(float16(0.33333f).x, 0x3555);
+  EXPECT_EQ(float16(0.0f).x, 0x0000);
+  EXPECT_EQ(float16(-0.0f).x, 0x8000);
+  EXPECT_EQ(float16(65504.0f).x, 0x7bff);
+  EXPECT_EQ(float16(65536.0f).x, 0x7c00);
+
+  // Conversion from double
+  EXPECT_EQ(float16(1.0).x, 0x3c00);
+  EXPECT_EQ(float16(0.5).x, 0x3800);
+  EXPECT_EQ(float16(0.33333).x, 0x3555);
+  EXPECT_EQ(float16(0.0).x, 0x0000);
+  EXPECT_EQ(float16(-0.0).x, 0x8000);
+  EXPECT_EQ(float16(65504.0).x, 0x7bff);
+  EXPECT_EQ(float16(65536.0).x, 0x7c00);
+
+  // Conversion from int
+  EXPECT_EQ(float16(-1).x, 0xbc00);
+  EXPECT_EQ(float16(0).x, 0x0000);
+  EXPECT_EQ(float16(1).x, 0x3c00);
+  EXPECT_EQ(float16(2).x, 0x4000);
+  EXPECT_EQ(float16(3).x, 0x4200);
+
+  // Conversion from bool
+  EXPECT_EQ(float16(true).x, 0x3c00);
+  EXPECT_EQ(float16(false).x, 0x0000);
+
+  // Default constructor
+  float16 v_def;
+  EXPECT_EQ(v_def.x, 0x0000);
+
+  // Assignment operator
+  float16 v_assign;
+  v_assign = v_def;
+  EXPECT_EQ(v_assign.x, 0x0000);
+  v_assign = Eigen::half(1.0f);
+  EXPECT_EQ(v_assign.x, 0x3c00);
+  v_assign = 0.5f;
+  EXPECT_EQ(v_assign.x, 0x3800);
+  v_assign = 0.33333;
+  EXPECT_EQ(v_assign.x, 0x3555);
+  v_assign = -1;
+  EXPECT_EQ(v_assign.x, 0xbc00);
+  v_assign = true;
+  EXPECT_EQ(v_assign.x, 0x3c00);
+
+  // Conversion operator
+  EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
+  EXPECT_EQ(float(float16(0.5f)), 0.5f);
+  EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001);
+  EXPECT_EQ(int(float16(-1)), -1);
+  EXPECT_EQ(bool(float16(true)), true);
+}
+
+TEST(float16, arithmetic_cpu) {
+  EXPECT_EQ(float(float16(1) + float16(1)), 2);
+  EXPECT_EQ(float(float16(5) + float16(-5)), 0);
+  EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001);
+  EXPECT_EQ(float(float16(3) - float16(5)), -2);
+  EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001);
+  EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
+  EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
+  EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001);
+  EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f);
+  EXPECT_EQ(float(-float16(512.0f)), -512.0f);
+  EXPECT_EQ(float(-float16(-512.0f)), 512.0f);
+}
+
+TEST(float16, comparison_cpu) {
+  EXPECT_TRUE(float16(1.0f) == float16(1.0f));
+  EXPECT_FALSE(float16(-1.0f) == float16(-0.5f));
+  EXPECT_TRUE(float16(1.0f) != float16(0.5f));
+  EXPECT_FALSE(float16(-1.0f) != float16(-1.0f));
+  EXPECT_TRUE(float16(1.0f) < float16(2.0f));
+  EXPECT_FALSE(float16(-1.0f) < float16(-1.0f));
+  EXPECT_TRUE(float16(1.0f) <= float16(1.0f));
+  EXPECT_TRUE(float16(2.0f) > float16(1.0f));
+  EXPECT_FALSE(float16(-2.0f) > float16(-2.0f));
+  EXPECT_TRUE(float16(2.0f) >= float16(2.0f));
+
+  EXPECT_TRUE(float16(0.0f) == float16(-0.0f));
+  EXPECT_TRUE(float16(0.0f) <= float16(-0.0f));
+  EXPECT_TRUE(float16(0.0f) >= float16(-0.0f));
+  EXPECT_FALSE(float16(0.0f) < float16(-0.0f));
+  EXPECT_FALSE(float16(-0.0f) < float16(0.0f));
+  EXPECT_FALSE(float16(0.0f) > float16(-0.0f));
+  EXPECT_FALSE(float16(-0.0f) > float16(0.0f));
+}
+
+}  // namespace paddle
diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu
new file mode 100644
index 0000000000..4b520feaaf
--- /dev/null
+++ b/paddle/math/tests/test_float16.cu
@@ -0,0 +1,213 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+#include "paddle/utils/Logging.h"
+
+#define ARITHMETIC_KERNEL(op_type, sign)                                 \
+  __global__ void op_type(const half* in1, const half* in2, half* out) { \
+    out[0] = in1[0] sign in2[0];                                         \
+  }
+
+#define COMPOUND_KERNEL(op_type, sign) \
+  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
+
+#define COMPARISON_KERNEL(op_type, sign)                                 \
+  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+    out[0] = in1[0] sign in2[0];                                         \
+  }
+
+#define ARITHMETIC_KERNEL_LAUNCH(op_type)                     \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
+    half *in1, *in2, *out;                                    \
+    half *d_in1, *d_in2, *d_out;                              \
+    int size = sizeof(half);                                  \
+    cudaMalloc((void**)&d_in1, size);                         \
+    cudaMalloc((void**)&d_in2, size);                         \
+    cudaMalloc((void**)&d_out, size);                         \
+    in1 = (half*)malloc(size);                                \
+    in2 = (half*)malloc(size);                                \
+    out = (half*)malloc(size);                                \
+    in1[0] = half(float16(v_in1));                            \
+    in2[0] = half(float16(v_in2));                            \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
+    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                   \
+    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);     \
+    EXPECT_EQ(float(float16(out[0])), v_out);                 \
+    free(in1);                                                \
+    free(in2);                                                \
+    free(out);                                                \
+    cudaFree(d_in1);                                          \
+    cudaFree(d_in2);                                          \
+    cudaFree(d_out);                                          \
+  }
+
+#define COMPOUND_KERNEL_LAUNCH(op_type)                       \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
+    half *in1, *in2;                                          \
+    half *d_in1, *d_in2;                                      \
+    int size = sizeof(half);                                  \
+    cudaMalloc((void**)&d_in1, size);                         \
+    cudaMalloc((void**)&d_in2, size);                         \
+    in1 = (half*)malloc(size);                                \
+    in2 = (half*)malloc(size);                                \
+    in1[0] = half(float16(v_in1));                            \
+    in2[0] = half(float16(v_in2));                            \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
+    op_type<<<1, 1>>>(d_in1, d_in2);                          \
+    cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost);     \
+    EXPECT_EQ(float(float16(in1[0])), v_out);                 \
+    free(in1);                                                \
+    free(in2);                                                \
+    cudaFree(d_in1);                                          \
+    cudaFree(d_in2);                                          \
+  }
+
+#define COMPARISON_KERNEL_LAUNCH(op_type)                    \
+  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";          \
+    half *in1, *in2;                                         \
+    half *d_in1, *d_in2;                                     \
+    bool *out, *d_out;                                       \
+    int size = sizeof(half);                                 \
+    cudaMalloc((void**)&d_in1, size);                        \
+    cudaMalloc((void**)&d_in2, size);                        \
+    cudaMalloc((void**)&d_out, 1);                           \
+    in1 = (half*)malloc(size);                               \
+    in2 = (half*)malloc(size);                               \
+    out = (bool*)malloc(1);                                  \
+    in1[0] = half(float16(v_in1));                           \
+    in2[0] = half(float16(v_in2));                           \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);    \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);    \
+    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                  \
+    cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost);       \
+    EXPECT_EQ(out[0], v_out);                                \
+    free(in1);                                               \
+    free(in2);                                               \
+    free(out);                                               \
+    cudaFree(d_in1);                                         \
+    cudaFree(d_in2);                                         \
+    cudaFree(d_out);                                         \
+  }
+
+#ifdef PADDLE_CUDA_FP16
+namespace paddle {
+
+#if CUDA_VERSION < 9000
+ARITHMETIC_KERNEL(Add, +)
+ARITHMETIC_KERNEL(Sub, -)
+ARITHMETIC_KERNEL(Mul, *)
+ARITHMETIC_KERNEL(Div, /)
+
+ARITHMETIC_KERNEL_LAUNCH(Add)
+ARITHMETIC_KERNEL_LAUNCH(Sub)
+ARITHMETIC_KERNEL_LAUNCH(Mul)
+ARITHMETIC_KERNEL_LAUNCH(Div)
+
+// Negative sign kernel
+__global__ void Neg(half* in) { in[0] = -in[0]; }
+
+void TestNeg(float v_in, float v_out) {
+  LOG(INFO) << "Test Neg on GPU!";
+  half *in, *d_in;
+  int size = sizeof(half);
+  cudaMalloc((void**)&d_in, size);
+  in = (half*)malloc(size);
+  in[0] = half(float16(v_in));
+  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
+  Neg<<<1, 1>>>(d_in);
+  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
+  EXPECT_EQ(float(float16(in[0])), v_out);
+  free(in);
+  cudaFree(d_in);
+}
+
+COMPOUND_KERNEL(AddAssign, +=)
+COMPOUND_KERNEL(SubAssign, -=)
+COMPOUND_KERNEL(MulAssign, *=)
+COMPOUND_KERNEL(DivAssign, /=)
+
+COMPOUND_KERNEL_LAUNCH(AddAssign)
+COMPOUND_KERNEL_LAUNCH(SubAssign)
+COMPOUND_KERNEL_LAUNCH(MulAssign)
+COMPOUND_KERNEL_LAUNCH(DivAssign)
+
+COMPARISON_KERNEL(Equal, ==)
+COMPARISON_KERNEL(NotEqual, !=)
+COMPARISON_KERNEL(Less, <)
+COMPARISON_KERNEL(LessEqual, <=)
+COMPARISON_KERNEL(Greater, >)
+COMPARISON_KERNEL(GreaterEqual, >=)
+
+COMPARISON_KERNEL_LAUNCH(Equal)
+COMPARISON_KERNEL_LAUNCH(NotEqual)
+COMPARISON_KERNEL_LAUNCH(Less)
+COMPARISON_KERNEL_LAUNCH(LessEqual)
+COMPARISON_KERNEL_LAUNCH(Greater)
+COMPARISON_KERNEL_LAUNCH(GreaterEqual)
+
+TEST(float16, arithmetic_on_gpu) {
+  TestAdd(1, 2, 3);
+  TestSub(2, 1, 1);
+  TestMul(2, 3, 6);
+  TestDiv(6, 2, 3);
+  TestNeg(1, -1);
+}
+
+TEST(float16, compound_on_gpu) {
+  TestAddAssign(1, 2, 3);
+  TestSubAssign(2, 1, 1);
+  TestMulAssign(2, 3, 6);
+  TestDivAssign(6, 2, 3);
+}
+
+TEST(float16, comparision_on_gpu) {
+  TestEqual(1, 1, true);
+  TestEqual(1, 2, false);
+  TestNotEqual(2, 3, true);
+  TestNotEqual(2, 2, false);
+  TestLess(3, 4, true);
+  TestLess(3, 3, false);
+  TestLessEqual(3, 3, true);
+  TestLessEqual(3, 2, false);
+  TestGreater(4, 3, true);
+  TestGreater(4, 4, false);
+  TestGreaterEqual(4, 4, true);
+  TestGreaterEqual(4, 5, false);
+}
+#endif  // CUDA_VERSION
+
+TEST(float16, conversion_on_gpu) {
+  // Explicit conversion to and from cuda half
+  EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
+  EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
+  EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
+  EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
+  EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
+  EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
+  EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
+
+  // Assignment operator
+  float16 v_assign;
+  v_assign = half(float16(1.0f));
+  EXPECT_EQ(v_assign.x, 0x3c00);
+}
+
+}  // namespace paddle
+#endif  // PADDLE_CUDA_FP16
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 7e5a1db44a..afb8d9d599 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -244,7 +244,7 @@ TEST(Matrix, unary) {
     LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
                  << "support so we cannot test matrix inverse. To test "
                  << "matrix inverse, please install LAPACKE "
-                 << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle.";
+                 << "and MKL/Openblas, and re-build PaddlePaddle.";
 #endif
   }
 }
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 6b4e46f56a..509250debc 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
+#include <algorithm>   // for std::max
 
 #include "gflags/gflags.h"
 
@@ -28,7 +29,7 @@ limitations under the License. */
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
 DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
-
+DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -43,7 +44,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
 
   void* p;
 
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
   // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
   // memory alignment
   PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
@@ -77,45 +78,20 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0.  We just make sure it does.
   if (size <= 0) return nullptr;
-
-  size_t available = 0;
-  size_t capacity = 0;
-  paddle::platform::GpuMemoryUsage(available, capacity);
-
-  // Reserve memory for page tables, etc.
-  size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
-  size_t usable = available > reserving ? available - reserving : 0;
-
-  // If remaining size no less than expected size, using general
-  // cudaMalloc to allocate GPU memory.
-  void* p = 0;
-  if (size <= usable) {
-    cudaError_t result = cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      index = 0;
-      gpu_alloc_size_ += size;
-      return p;
-    }
-  }
-
-  // If remaining size less than expected size or cudaMalloc failed,
-  // cudaMallocHost will be considered as a fallback allocator.
-  //
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
-  // of host fallback allocation. Allocates too much would reduce
-  // the amount of memory available to the underlying system for paging.
-  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
-
-  if (size > usable) return nullptr;
-
-  cudaError_t result = cudaMallocHost(&p, size);
+  void* p;
+  cudaError_t result = cudaMalloc(&p, size);
   if (result == cudaSuccess) {
-    index = 1;
-    fallback_alloc_size_ += size;
+    index = 0;
+    gpu_alloc_size_ += size;
     return p;
+  } else {
+    LOG(WARNING)
+        << "Cannot malloc " << size / 1024.0 / 1024.0
+        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
+           "environment variable to a lower value. Current value is "
+        << FLAGS_fraction_of_gpu_memory_to_use;
+    return nullptr;
   }
-
-  return nullptr;
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 95cfe2525e..9cafdfda75 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -64,19 +64,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
     int gpu_num = platform::GetCUDADeviceCount();
     as = new BuddyAllocator*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
-      platform::SetDeviceId(gpu);
-      as[gpu] = new BuddyAllocator(new detail::GPUAllocator,
-                                   platform::GpuMinChunkSize(),
-                                   platform::GpuMaxChunkSize());
+      as[gpu] = nullptr;
     }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
     VLOG(10) << "\n\nNOTE: each GPU device use "
              << FLAGS_fraction_of_gpu_memory_to_use * 100
              << "% of GPU memory.\n"
-             << "You can set environment variable '"
-             << platform::kEnvFractionGpuMemoryToUse
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
              << "' to change the fraction of GPU usage.\n\n";
   }
-  platform::SetDeviceId(gpu_id);
   return as[gpu_id];
 }
 
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 38b89b9eb1..5aaaf99332 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -138,7 +138,7 @@ function(op_library TARGET)
     if ("${TARGET}" STREQUAL "nccl_op")
         set(pybind_flag 1)
         # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
     endif()
 
     # reduce_op contains several operators
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 2785a8c6fb..76da21c472 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
-        ctx.device_context());
+        ctx.GetPlace());
   }
 };
 
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index d2dcab4e54..539a935302 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 
 // FIXME(typhoonzero): types of T is for inference data.
 // label data is always int64
-REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
-                       paddle::operators::AccuracyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(accuracy,
+                        paddle::operators::AccuracyOpCUDAKernel<float>,
+                        paddle::operators::AccuracyOpCUDAKernel<double>);
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index d060e6eddd..04104a695f 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -21,7 +21,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AccuracyKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 83262f950e..63490f0ec9 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -506,6 +506,22 @@ It is recommended to use the defaults for this activation.
   }
 };
 
+class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Swish operator");
+    AddOutput("Y", "Output of Swish operator");
+    AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
+    AddComment(R"DOC(
+Swish Activation Operator.
+
+$$y = \frac{x}{1 + e^{- \beta x}}$$
+
+)DOC");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -592,16 +608,20 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
 REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
             hard_sigmoid_grad, ops::ActivationOpGrad);
 
-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)       \
-  REGISTER_OP_CPU_KERNEL(                                                     \
-      act_type,                                                               \
-      ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
-      ops::ActivationKernel<paddle::platform::CPUPlace,                       \
-                            ops::functor<double>>);                           \
-  REGISTER_OP_CPU_KERNEL(                                                     \
-      act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace,  \
-                                                 ops::grad_functor<float>>,   \
-      ops::ActivationGradKernel<paddle::platform::CPUPlace,                   \
+REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
+            ops::ActivationOpGrad);
+
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)   \
+  REGISTER_OP_CPU_KERNEL(                                                 \
+      act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
+                                      ops::functor<float>>,               \
+      ops::ActivationKernel<paddle::platform::CPUDeviceContext,           \
+                            ops::functor<double>>);                       \
+  REGISTER_OP_CPU_KERNEL(                                                 \
+      act_type##_grad,                                                    \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
+                                ops::grad_functor<float>>,                \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
index 97737857ab..856d3fc35d 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -17,16 +17,17 @@
 
 namespace ops = paddle::operators;
 
-#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)       \
-  REGISTER_OP_GPU_KERNEL(                                                     \
-      act_type,                                                               \
-      ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
-      ops::ActivationKernel<paddle::platform::GPUPlace,                       \
-                            ops::functor<double>>);                           \
-  REGISTER_OP_GPU_KERNEL(                                                     \
-      act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace,  \
-                                                 ops::grad_functor<float>>,   \
-      ops::ActivationGradKernel<paddle::platform::GPUPlace,                   \
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)   \
+  REGISTER_OP_CUDA_KERNEL(                                                 \
+      act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
+                                      ops::functor<float>>,                \
+      ops::ActivationKernel<paddle::platform::CUDADeviceContext,           \
+                            ops::functor<double>>);                        \
+  REGISTER_OP_CUDA_KERNEL(                                                 \
+      act_type##_grad,                                                     \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
+                                ops::grad_functor<float>>,                 \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
                                 ops::grad_functor<double>>);
 
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 8cd3bfbbd3..75eefca8b8 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class ActivationKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
@@ -32,18 +32,19 @@ class ActivationKernel
 
     auto x = framework::EigenVector<T>::Flatten(*X);
     auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
 
     auto attrs = functor.GetAttrs();
     for (auto& attr : attrs) {
       *attr.second = context.Attr<float>(attr.first);
     }
-    functor(place, x, y);
+    functor(*place, x, y);
   }
 };
 
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class ActivationGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
@@ -59,13 +60,14 @@ class ActivationGradKernel
     auto x = framework::EigenVector<T>::Flatten(*X);
     auto y = framework::EigenVector<T>::Flatten(*Y);
     auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
     auto attrs = functor.GetAttrs();
     for (auto& attr : attrs) {
       *attr.second = context.Attr<float>(attr.first);
     }
-    functor(place, x, y, dy, dx);
+    functor(*place, x, y, dy, dx);
   }
 };
 
@@ -700,6 +702,35 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct SwishFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+  }
+};
+
+template <typename T>
+struct SwishGradFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp1 = static_cast<T>(1) /
+                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto temp2 = temp1 * (static_cast<T>(1) - (beta * y));
+    dx.device(d) = dy * ((beta * y) + temp2);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -730,4 +761,5 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   __macro(elu, ELUFunctor, ELUGradFunctor);                          \
   __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor);    \
   __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
+  __macro(swish, SwishFunctor, SwishGradFunctor);                    \
   __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
index 16a7794d5b..507811e7b5 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -92,12 +92,12 @@ for gradient descent.
 
 Adadelta updates are as follows:
 
-$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
-paramUpdate =  - $\sqrt{((avgSquaredUpdate + \epsilon) /
-                       (avgSquaredGrad_out + \epsilon))}$ * grad \break
-avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
-                                  {(paramUpdate)}^2 \break
-paramOut = param + paramUpdate$$
+$$
+avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\
+param\_update =  - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\
+avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\
+param\_out = param + param\_update
+$$
 
 )DOC");
   }
@@ -109,5 +109,5 @@ paramOut = param + paramUpdate$$
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
-    ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu
index 9fb6185207..eee2d0a2f5 100644
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/adadelta_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
-    ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h
index a8c5f0c8aa..819d0845db 100644
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdadeltaOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
         framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
     auto avg_squared_update_out =
         framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     avg_squared_grad_out.device(place) =
         rho * avg_squared_grad + (1 - rho) * grad.square();
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
index d6686e3ef3..5d00716316 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -80,8 +80,8 @@ Adaptive Gradient Algorithm (Adagrad).
 
 The update is done as follows:
 
-$$momentOut = moment + grad * grad \break
-paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break
+$$moment\_out = moment + grad * grad \\
+param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
 $$
 
 The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
@@ -100,8 +100,8 @@ size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
 }  // namespace
 
 template <typename T>
-struct SparseAdagradFunctor<platform::CPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& grad,
                   const framework::Tensor& learning_rate, T epsilon,
                   framework::Tensor* moment, framework::Tensor* param) {
@@ -120,7 +120,7 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
             {static_cast<int64_t>(merge_rows.size()), grad_width}),
         context.GetPlace());
 
-    math::SetConstant<platform::CPUPlace, T> constant_functor;
+    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
     constant_functor(context, grad_merge->mutable_value(), 0.0);
 
     auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -144,9 +144,9 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
     auto gs =
         framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
     auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
+    gs.device(*context.eigen_device()) = gm * gm;
 
-    math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
+    math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
     functor(context, *grad_square, moment);
 
     // 3. update parameter
@@ -164,13 +164,13 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template struct SparseAdagradFunctor<platform::CPUPlace, float>;
-template struct SparseAdagradFunctor<platform::CPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
-    ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
+    adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 1c870214b2..585b2d9289 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
 }  // namespace
 
 template <typename T>
-struct SparseAdagradFunctor<platform::GPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& grad,
                   const framework::Tensor& learning_rate, T epsilon,
                   framework::Tensor* moment, framework::Tensor* param) {
@@ -92,7 +92,7 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
             {static_cast<int64_t>(merge_rows.size()), grad_width}),
         context.GetPlace());
 
-    math::SetConstant<platform::GPUPlace, T> constant_functor;
+    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
     constant_functor(context, grad_merge->mutable_value(), 0.0);
 
     auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -119,9 +119,9 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
     auto gs =
         framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
     auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-    gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
+    gs.device(*context.eigen_device()) = gm * gm;
 
-    math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
+    math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
     functor(context, *grad_square, moment);
 
     // 3. update parameter
@@ -139,13 +139,13 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
   }
 };
 
-template struct SparseAdagradFunctor<platform::GPUPlace, float>;
-template struct SparseAdagradFunctor<platform::GPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
-    ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h
index 4d4a6434c7..0d77dbcbac 100644
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/operators/adagrad_op.h
@@ -19,15 +19,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SparseAdagradFunctor {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& grad,
                   const framework::Tensor& learning_rate, T epsilon,
                   framework::Tensor* moment, framework::Tensor* param);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel<T> {
 
       auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
       auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-      auto place = ctx.GetEigenDevice<Place>();
+      auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
-      moment_out.device(place) = moment + grad * grad;
+      moment_out.device(*place) = moment + grad * grad;
       Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-      param_out.device(place) =
+      param_out.device(*place) =
           param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto* param_tensor = ctx.Input<framework::Tensor>("Param");
@@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel<T> {
       auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
       PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
 
-      SparseAdagradFunctor<Place, T> functor;
-      functor(ctx.device_context(), *ctx.Input<framework::SelectedRows>("Grad"),
+      SparseAdagradFunctor<DeviceContext, T> functor;
+      functor(ctx.template device_context<DeviceContext>(),
+              *ctx.Input<framework::SelectedRows>("Grad"),
               *ctx.Input<framework::Tensor>("LearningRate"), epsilon,
               moment_out_tensor, param_out_tensor);
     } else {
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
index 03faa2a7c5..cf6ef6dd53 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -112,11 +112,13 @@ adaptive estimates of lower-order moments.
 
 Adam updates:
 
-$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
-moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
-learningRate = learningRate *
-                  $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break
-paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
+$$
+moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
+moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
+learning\_rate = learning\_rate *
+                  \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
+param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+$$
 
 )DOC");
   }
@@ -126,6 +128,6 @@ paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
-REGISTER_OP_CPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
-                       ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu
index 6e34f7818c..c135b37378 100644
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/adam_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
-                       ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    adam, ops::AdamOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdamOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 7f7fa1da1c..45157842a6 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdamOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel<T> {
     auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
     auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
     auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
-    moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
-    moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
+    moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad;
+    moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square();
 
     // All of these are tensors of 1 element
     auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
     // Eigen does not support automatic broadcast
     // Get dimensions of moment vector to broadcast lr_t
     Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
-    param_out.device(place) =
+    param_out.device(*place) =
         param -
         lr_t.broadcast(m_dsize) *
             (moment1_out / (moment2_out.sqrt() + epsilon));
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index 867ddd9790..49ce497bb7 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -108,10 +108,10 @@ Adam algorithm based on the infinity norm.
 Adamax updates:
 
 $$
-  momentOut = \beta_{1} * moment + (1 - \beta_{1}) * grad \\
-  infNormOut = max(\beta_{2} * infNorm + \epsilon, |grad|) \\
-  learningRate = \frac{learningRate}{1 - \beta_{1}^{Beta1Pow}} \\
-  paramOut = param - learningRate * \frac{momentOut}{infNormOut}
+moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\
+inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\
+learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\
+param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}
 $$
 
 The original paper does not have an epsilon attribute.
@@ -127,6 +127,6 @@ division by 0 error.
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
-REGISTER_OP_CPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
-                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    adamax, ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu
index 057ef39025..2d143905c4 100644
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/adamax_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
-                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    adamax, ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
index bf36ed7860..172c179c5f 100644
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AdamaxOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
     auto inf_norm_out =
         framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
-    moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
-    inf_norm_out.device(place) =
+    moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad;
+    inf_norm_out.device(*place) =
         grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
     auto lr_t = lr / (1 - beta1_pow);
     Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-    param_out.device(place) =
+    param_out.device(*place) =
         param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
   }
 };
diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h
index e5ac57b038..b80509e2a9 100644
--- a/paddle/operators/auc_op.h
+++ b/paddle/operators/auc_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index ac97bd83ab..94a972b7ab 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -135,7 +135,8 @@ The required data format for this layer is one of the following:
 };
 
 template <typename T>
-class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
+class BatchNormKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
@@ -318,12 +319,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
       PADDLE_THROW("can't find Y@GRAD");
     }
     return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.device_context());
+                                   ctx.GetPlace());
   }
 };
 
 template <typename T>
-class BatchNormGradKernel<platform::CPUPlace, T>
+class BatchNormGradKernel<platform::CPUDeviceContext, T>
     : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
@@ -436,8 +437,9 @@ class BatchNormGradKernel<platform::CPUPlace, T>
 namespace ops = paddle::operators;
 REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
             batch_norm_grad, ops::BatchNormGradOp);
-REGISTER_OP_CPU_KERNEL(batch_norm,
-                       ops::BatchNormKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    batch_norm,
+    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUPlace, float>);
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc
index 7b2f318700..c7adc3d80e 100644
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -47,7 +47,8 @@ void ExtractNCWHD(const framework::DDim &dims,
 }
 
 template <typename T>
-class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
+class BatchNormKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -121,11 +122,12 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     saved_mean->mutable_data<T>(ctx.GetPlace());
     saved_variance->mutable_data<T>(ctx.GetPlace());
 
-    math::SetConstant<platform::GPUPlace, T> functor;
-    functor(ctx.device_context(), saved_mean, 0);
-    functor(ctx.device_context(), saved_variance, 0);
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> functor;
+    functor(dev_ctx, saved_mean, 0);
+    functor(dev_ctx, saved_variance, 0);
 
-    auto handle = ctx.cuda_device_context().cudnn_handle();
+    auto handle = dev_ctx.cudnn_handle();
 
     // Now, depending on whether we are running test or not, we have two paths.
     if (is_test) {
@@ -171,7 +173,7 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
 };
 
 template <typename T>
-class BatchNormGradKernel<platform::GPUPlace, T>
+class BatchNormGradKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
@@ -244,11 +246,12 @@ class BatchNormGradKernel<platform::GPUPlace, T>
     const void *saved_mean_data = saved_mean->template data<T>();
     const void *saved_var_data = saved_var->template data<T>();
 
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-        ctx.cuda_device_context().cudnn_handle(), mode_,
-        CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
-        CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
-        x->template data<T>(), data_desc_, d_y->template data<T>(), data_desc_,
+        dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+        data_desc_, d_y->template data<T>(), data_desc_,
         d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
         scale->template data<T>(),
         d_scale->template mutable_data<T>(ctx.GetPlace()),
@@ -266,8 +269,9 @@ class BatchNormGradKernel<platform::GPUPlace, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(batch_norm,
-                       ops::BatchNormKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    batch_norm,
+    ops::BatchNormKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::GPUPlace, float>);
+    ops::BatchNormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h
index 4e80134a1a..8d99b68647 100644
--- a/paddle/operators/batch_norm_op.h
+++ b/paddle/operators/batch_norm_op.h
@@ -34,13 +34,13 @@ inline TensorFormat StringToTensorFormat(const std::string& str) {
   }
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BatchNormGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override;
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
index c88b2c9beb..217fd52366 100644
--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -159,9 +159,12 @@ REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
             ops::BilinearTensorProductOpGrad);
 REGISTER_OP_CPU_KERNEL(
     bilinear_tensor_product,
-    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, float>,
-    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, double>);
+    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
 REGISTER_OP_CPU_KERNEL(
     bilinear_tensor_product_grad,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, float>,
-    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, double>);
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
+                                         float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
+                                         double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu
index 858d2668d0..0f48010716 100644
--- a/paddle/operators/bilinear_tensor_product_op.cu
+++ b/paddle/operators/bilinear_tensor_product_op.cu
@@ -16,11 +16,15 @@ limitations under the License. */
 #include "paddle/operators/bilinear_tensor_product_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     bilinear_tensor_product,
-    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>,
-    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
+    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
+                                     double>);
+REGISTER_OP_CUDA_KERNEL(
     bilinear_tensor_product_grad,
-    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>,
-    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, double>);
+    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
+                                         float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
+                                         double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h
index 1113a4c6f3..ba9a2c5ce3 100644
--- a/paddle/operators/bilinear_tensor_product_op.h
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BilinearTensorProductKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -46,7 +46,8 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
     int out_dim = weight_dims[0];
     auto x_dim = weight_dims[1];
     auto y_dim = weight_dims[2];
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
     // Create the intermediate variable to caculate the result of
     // Input(X) multiplied by Input(Weight_i), the formula is:
@@ -60,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
       auto output_col_vec = output_mat.chip(i, 1);
       Tensor weight_mat =
           weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
-                           batch_size, y_dim, x_dim, 1, x->data<T>(),
-                           weight_mat.data<T>(), 0, left_mul.data<T>());
+      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans,
+                                   batch_size, y_dim, x_dim, 1, x->data<T>(),
+                                   weight_mat.data<T>(), 0, left_mul.data<T>());
       output_col_vec.device(place) =
           (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
     }
@@ -74,7 +75,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -96,8 +97,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
     auto x_mat = EigenMatrix<T>::From(*x);
     auto y_mat = EigenMatrix<T>::From(*y);
     auto d_out_mat = EigenMatrix<T>::From(*d_out);
-    auto place = ctx.GetEigenDevice<Place>();
-
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     // Create the intermediate variable to caculate the Output(Y@Grad).
     Tensor x_scale;
     x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
@@ -110,18 +111,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
                             ctx.GetPlace());
     auto y_scale_mat = EigenMatrix<T>::From(y_scale);
 
-    math::SetConstant<Place, T> set_zero;
+    math::SetConstant<DeviceContext, T> set_zero;
 
     // Set Output(X@Grad) be zero.
     if (d_x) {
       d_x->mutable_data<T>(ctx.GetPlace());
-      set_zero(ctx.device_context(), d_x, static_cast<T>(0));
+      set_zero(dev_ctx, d_x, static_cast<T>(0));
     }
 
     // Set Output(Y@Grad) be zero.
     if (d_y) {
       d_y->mutable_data<T>(ctx.GetPlace());
-      set_zero(ctx.device_context(), d_y, static_cast<T>(0));
+      set_zero(dev_ctx, d_y, static_cast<T>(0));
     }
 
     // Caculate the Output(X@Grad) and Output(Y@Grad).
@@ -137,18 +138,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
               output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                   .broadcast(bcast_for_x) *
               y_mat;
-          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasTrans,
-                               batch_size, x_dim, y_dim, 1, y_scale.data<T>(),
-                               weight_i.data<T>(), 1, d_x->data<T>());
+          math::gemm<DeviceContext, T>(
+              dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
+              y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
         }
         if (d_y) {
           x_scale_mat.device(place) =
               output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                   .broadcast(bcast_for_y) *
               x_mat;
-          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
-                               batch_size, y_dim, x_dim, 1, x_scale.data<T>(),
-                               weight_i.data<T>(), 1, d_y->data<T>());
+          math::gemm<DeviceContext, T>(
+              dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
+              x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
         }
       }
     }
@@ -165,9 +166,9 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
             output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                 .broadcast(bcast_for_weight) *
             x_mat;
-        math::gemm<Place, T>(ctx.device_context(), CblasTrans, CblasNoTrans,
-                             x_dim, y_dim, batch_size, 1, x_scale.data<T>(),
-                             y->data<T>(), 0, d_weight_i.data<T>());
+        math::gemm<DeviceContext, T>(dev_ctx, CblasTrans, CblasNoTrans, x_dim,
+                                     y_dim, batch_size, 1, x_scale.data<T>(),
+                                     y->data<T>(), 0, d_weight_i.data<T>());
       }
     }
 
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
index 3082a53ccf..d641b8fc9f 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -68,10 +68,11 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUPlace;
+using CPU = paddle::platform::CPUDeviceContext;
 REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
                         ops::CastOpProtoMaker);
 REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, double>,
                        ops::CastOpKernel<CPU, int>,
-                       ops::CastOpKernel<CPU, int64_t>);
+                       ops::CastOpKernel<CPU, int64_t>,
+                       ops::CastOpKernel<CPU, bool>);
diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu
index fb75ddbabf..91e6fb391c 100644
--- a/paddle/operators/cast_op.cu
+++ b/paddle/operators/cast_op.cu
@@ -16,7 +16,8 @@
 
 template <typename T>
 using CastOpKernel =
-    paddle::operators::CastOpKernel<paddle::platform::GPUPlace, T>;
+    paddle::operators::CastOpKernel<paddle::platform::CUDADeviceContext, T>;
 
-REGISTER_OP_GPU_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
-                       CastOpKernel<int>, CastOpKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
+                        CastOpKernel<int>, CastOpKernel<int64_t>,
+                        CastOpKernel<bool>);
diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h
index 850dc8e349..a6773f13a8 100644
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
@@ -27,13 +27,13 @@ struct CastOpTransformFunctor {
   HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
 };
 
-template <typename Place, typename InT>
+template <typename DeviceContext, typename InT>
 struct CastOpFunctor {
   const framework::Tensor* in_;
   framework::Tensor* out_;
-  const platform::DeviceContext& ctx_;
+  const DeviceContext& ctx_;
   CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
-                const platform::DeviceContext& ctx)
+                const DeviceContext& ctx)
       : in_(in), out_(out), ctx_(ctx) {}
 
   template <typename OutT>
@@ -42,13 +42,13 @@ struct CastOpFunctor {
     auto numel = in_->numel();
     auto* in_end = in_begin + numel;
     auto* out_begin = out_->mutable_data<OutT>(ctx_.GetPlace());
-    platform::Transform<Place> trans;
+    platform::Transform<DeviceContext> trans;
     trans(ctx_, in_begin, in_end, out_begin,
           CastOpTransformFunctor<InT, OutT>());
   }
 };
 
-template <typename Place, typename InT>
+template <typename DeviceContext, typename InT>
 class CastOpKernel : public framework::OpKernel<InT> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -56,7 +56,8 @@ class CastOpKernel : public framework::OpKernel<InT> {
     auto* out = context.Output<framework::Tensor>("Out");
     framework::VisitDataType(
         static_cast<framework::DataType>(context.Attr<int>("out_dtype")),
-        CastOpFunctor<Place, InT>(in, out, context.device_context()));
+        CastOpFunctor<DeviceContext, InT>(
+            in, out, context.template device_context<DeviceContext>()));
   }
 };
 
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc
index 94127ab33e..894f355deb 100644
--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
@@ -32,6 +32,13 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
                    "Output(Recall) of ChunkEvalOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
                    "Output(F1-Score) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"),
+                   "Output(NumInferChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"),
+                   "Output(NumLabelChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NumCorrectChunks"),
+        "Output(NumCorrectChunks) of ChunkEvalOp should not be null.");
 
     auto inference_dim = ctx->GetInputDim("Inference");
     auto label_dim = ctx->GetInputDim("Label");
@@ -42,6 +49,9 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Precision", {1});
     ctx->SetOutputDim("Recall", {1});
     ctx->SetOutputDim("F1-Score", {1});
+    ctx->SetOutputDim("NumInferChunks", {1});
+    ctx->SetOutputDim("NumLabelChunks", {1});
+    ctx->SetOutputDim("NumCorrectChunks", {1});
   }
 
  protected:
@@ -70,6 +80,16 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
               "sensitivity) of chunks on the given mini-batch.");
     AddOutput("F1-Score",
               "(float). The evaluated F1-Score on the given mini-batch.");
+    AddOutput("NumInferChunks",
+              "(int64_t). The number of chunks in Inference on the given "
+              "mini-batch.");
+    AddOutput(
+        "NumLabelChunks",
+        "(int64_t). The number of chunks in Label on the given mini-batch.");
+    AddOutput(
+        "NumCorrectChunks",
+        "(int64_t). The number of chunks both in Inference and Label on the "
+        "given mini-batch.");
     AddAttr<int>("num_chunk_types",
                  "(int). The number of chunk type. See below for details.");
     AddAttr<std::string>(
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h
index dd88f2553b..74ab435c86 100644
--- a/paddle/operators/chunk_eval_op.h
+++ b/paddle/operators/chunk_eval_op.h
@@ -23,7 +23,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ChunkEvalKernel : public framework::OpKernel<T> {
  public:
   struct Segment {
@@ -111,9 +111,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     std::vector<Segment> label_segments;
     std::vector<Segment> output_segments;
     std::set<int> excluded_chunk_types;
-    int64_t num_output_segments = 0;
-    int64_t num_label_segments = 0;
-    int64_t num_correct = 0;
+
     if (context.Attr<std::string>("chunk_scheme") == "IOB") {
       num_tag_types = 2;
       tag_begin = 0;
@@ -151,12 +149,24 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     auto* precision = context.Output<Tensor>("Precision");
     auto* recall = context.Output<Tensor>("Recall");
     auto* f1 = context.Output<Tensor>("F1-Score");
+    auto* num_infer_chunks = context.Output<Tensor>("NumInferChunks");
+    auto* num_label_chunks = context.Output<Tensor>("NumLabelChunks");
+    auto* num_correct_chunks = context.Output<Tensor>("NumCorrectChunks");
 
     const int64_t* inference_data = inference->data<int64_t>();
     const int64_t* label_data = label->data<int64_t>();
     T* precision_data = precision->mutable_data<T>(context.GetPlace());
     T* racall_data = recall->mutable_data<T>(context.GetPlace());
     T* f1_data = f1->mutable_data<T>(context.GetPlace());
+    int64_t* num_infer_chunks_data =
+        num_infer_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_label_chunks_data =
+        num_label_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_correct_chunks_data =
+        num_correct_chunks->mutable_data<int64_t>(context.GetPlace());
+    *num_infer_chunks_data = 0;
+    *num_label_chunks_data = 0;
+    *num_correct_chunks_data = 0;
 
     auto lod = label->lod();
     PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
@@ -166,17 +176,23 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     for (int i = 0; i < num_sequences; ++i) {
       int seq_length = lod[0][i + 1] - lod[0][i];
       EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
-                 output_segments, label_segments, num_output_segments,
-                 num_label_segments, num_correct, num_chunk_types,
-                 num_tag_types, other_chunk_type, tag_begin, tag_inside,
-                 tag_end, tag_single, excluded_chunk_types);
+                 output_segments, label_segments, *num_infer_chunks_data,
+                 *num_label_chunks_data, *num_correct_chunks_data,
+                 num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
+                 tag_inside, tag_end, tag_single, excluded_chunk_types);
     }
-    *precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
-                                                     num_output_segments;
-    *racall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
-                                                 num_label_segments;
-    *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*racall_data) /
-                                      ((*precision_data) + (*racall_data));
+    *precision_data = !(*num_infer_chunks_data)
+                          ? 0
+                          : static_cast<T>(*num_correct_chunks_data) /
+                                (*num_infer_chunks_data);
+    *racall_data = !(*num_label_chunks_data)
+                       ? 0
+                       : static_cast<T>(*num_correct_chunks_data) /
+                             (*num_label_chunks_data);
+    *f1_data = !(*num_correct_chunks_data)
+                   ? 0
+                   : 2 * (*precision_data) * (*racall_data) /
+                         ((*precision_data) + (*racall_data));
   }
 
   void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
index f73d55bbe3..0b7975a63f 100644
--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -71,4 +71,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
                              ops::ClipByNormOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    clip_by_norm, ops::ClipByNormKernel<paddle::platform::CPUPlace, float>);
+    clip_by_norm,
+    ops::ClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu
index 2593a24ebb..acd7543823 100644
--- a/paddle/operators/clip_by_norm_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -15,5 +15,6 @@
 #include "paddle/operators/clip_by_norm_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    clip_by_norm, ops::ClipByNormKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    clip_by_norm,
+    ops::ClipByNormKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
index b26476cae9..d8db1566b0 100644
--- a/paddle/operators/clip_by_norm_op.h
+++ b/paddle/operators/clip_by_norm_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ClipByNormKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -38,7 +38,8 @@ class ClipByNormKernel : public framework::OpKernel<T> {
     auto x = EigenVector<T>::Flatten(*input);
     auto out = EigenVector<T>::Flatten(*output);
     auto x_norm = x.square().sum().sqrt();
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto temp = (x_norm <= max_norm).template cast<T>().eval();
     auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index 4ddf24dea3..6092212de4 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -83,7 +83,7 @@ class ClipOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
             ops::ClipOpGrad);
-REGISTER_OP_CPU_KERNEL(clip,
-                       ops::ClipKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(clip_grad,
-                       ops::ClipGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu
index ca9701298f..bb7dcc671a 100644
--- a/paddle/operators/clip_op.cu
+++ b/paddle/operators/clip_op.cu
@@ -15,7 +15,7 @@
 #include "paddle/operators/clip_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(clip,
-                       ops::ClipKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(clip_grad,
-                       ops::ClipGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    clip, ops::ClipKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    clip_grad, ops::ClipGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h
index ac702e9935..0c40797410 100644
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
@@ -55,7 +55,7 @@ class ClipGradFunctor {
   T max_;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ClipKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -66,13 +66,13 @@ class ClipKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
     const T* x_data = x->data<T>();
     int64_t numel = x->numel();
-    Transform<Place> trans;
-    trans(context.device_context(), x_data, x_data + numel, out_data,
-          ClipFunctor<T>(min, max));
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ClipGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -86,9 +86,9 @@ class ClipGradKernel : public framework::OpKernel<T> {
       auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
       const T* d_out_data = d_out->data<T>();
       const T* x_data = x->data<T>();
-      Transform<Place> trans;
-      trans(context.device_context(), d_out_data, d_out_data + numel, x_data,
-            d_x_data, ClipGradFunctor<T>(min, max));
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), d_out_data,
+            d_out_data + numel, x_data, d_x_data, ClipGradFunctor<T>(min, max));
     }
   }
 };
diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu
index 6ac8c124b9..596a878bcf 100644
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@@ -14,10 +14,10 @@
 
 #include "paddle/operators/compare_op.h"
 
-REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
-REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
-REGISTER_LOGICAL_KERNEL(greater_than, GPU,
+REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_KERNEL(greater_than, CUDA,
                         paddle::operators::GreaterThanFunctor);
-REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
+REGISTER_LOGICAL_KERNEL(greater_equal, CUDA,
                         paddle::operators::GreaterEqualFunctor);
-REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
+REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
index afdf3ab3e0..a56536e155 100644
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -59,7 +59,7 @@ struct EqualFunctor {
   }
 };
 
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class CompareOpKernel
     : public framework::OpKernel<typename Functor::ELEM_TYPE> {
  public:
@@ -69,24 +69,23 @@ class CompareOpKernel
     auto* y = context.Input<framework::Tensor>("Y");
     auto* out = context.Output<framework::Tensor>("Out");
     Functor binary_func;
-    platform::Transform<Place> trans;
-    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
-          y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
-          binary_func);
+    platform::Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x->data<T>(),
+          x->data<T>() + x->numel(), y->data<T>(),
+          out->mutable_data<bool>(context.GetPlace()), binary_func);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor)                     \
-  REGISTER_OP_##dev##_KERNEL(                                              \
-      op_type,                                                             \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
-                                           functor<int>>,                  \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
-                                           functor<int64_t>>,              \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
-                                           functor<float>>,                \
-      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
-                                           functor<double>>);
+#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor)                    \
+  REGISTER_OP_##dev##_KERNEL(                                             \
+      op_type, ::paddle::operators::CompareOpKernel<                      \
+                   ::paddle::platform::dev##DeviceContext, functor<int>>, \
+      ::paddle::operators::CompareOpKernel<                               \
+          ::paddle::platform::dev##DeviceContext, functor<int64_t>>,      \
+      ::paddle::operators::CompareOpKernel<                               \
+          ::paddle::platform::dev##DeviceContext, functor<float>>,        \
+      ::paddle::operators::CompareOpKernel<                               \
+          ::paddle::platform::dev##DeviceContext, functor<double>>);
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index 6134ac78b1..cf522d6921 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -41,14 +41,18 @@ class ConcatOp : public framework::OperatorWithKernel {
       for (size_t j = 0; j < in_zero_dims_size; j++) {
         if (j == axis) {
           out_dims[axis] += ins[i][j];
-          continue;
+        } else {
+          PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                            "Input tensors should have the same "
+                            "elements except the specify axis.");
         }
-        PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
-                          "Input tensors should have the same "
-                          "elements except the specify axis.");
       }
     }
+    if (out_dims[axis] < 0) {
+      out_dims[axis] = -1;
+    }
     ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
diff --git a/paddle/operators/concat_op.cu.cc b/paddle/operators/concat_op.cu.cc
index ede832ddcd..7b46452d3d 100644
--- a/paddle/operators/concat_op.cu.cc
+++ b/paddle/operators/concat_op.cu.cc
@@ -14,7 +14,8 @@ limitations under the License. */
 
 #include "paddle/operators/concat_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(concat,
-                       ops::ConcatKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    concat_grad, ops::ConcatGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    concat_grad,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h
index c113f19fb5..de4011585a 100644
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ConcatKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -43,7 +43,7 @@ class ConcatKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ConcatGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
index d5b124682d..6f2ef9174e 100644
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -65,7 +65,7 @@ class ConditionalBlockOp : public ConditionalOp {
       scopes->front() = &scope.NewScope();
       auto &cur_scope = *scopes->front();
 
-      auto *block = Attr<framework::BlockDescBind *>("block");
+      auto *block = Attr<framework::BlockDescBind *>("sub_block");
       framework::Executor exec(dev_ctx);
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
     }
@@ -88,7 +88,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
               "unify the conditional block, rnn and while op, the type of "
               "scope is std::vector<Scope*>");
     AddAttr<framework::BlockDescBind *>(
-        "block", "The step block of conditional block operator");
+        "sub_block", "The step block of conditional block operator");
     AddComment(R"DOC(Conditional block operator
 
 Run the sub-block if X is not empty. Params is the other inputs and Out is the
@@ -117,7 +117,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
       framework::Scope &cur_scope = *scopes[0];
 
-      auto *block = Attr<framework::BlockDescBind *>("block");
+      auto *block = Attr<framework::BlockDescBind *>("sub_block");
       framework::Executor exec(dev_ctx);
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
 
@@ -142,9 +142,9 @@ class ConditionalBlockGradOp : public ConditionalOp {
         continue;
       }
       auto new_in_grad_name = cur_scope.Rename(in_grad_name);
-      auto assign =
-          framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}},
-                                          {{"Out", {out_grad_name}}}, {});
+      auto assign = framework::OpRegistry::CreateOp(
+          "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
+          framework::AttributeMap{});
       assign->Run(cur_scope, dev_ctx);
       cur_scope.Rename(new_in_grad_name, in_grad_name);
     }
@@ -181,7 +181,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetInput("Scope", Output("Scope"));
     grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
-    grad_op->SetBlockAttr("block", *this->grad_block_[0]);
+    grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
     return std::unique_ptr<framework::OpDescBind>(grad_op);
   }
 };
diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc
index 0dd8c13b2a..008bf01885 100644
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -57,18 +57,20 @@ REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker,
 REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker,
             conv3d_cudnn_grad, ops::ConvOpGrad);
 
-REGISTER_OP_CPU_KERNEL(conv2d_cudnn,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_cudnn,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     conv2d_cudnn_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
 
-REGISTER_OP_CPU_KERNEL(conv3d_cudnn,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_cudnn,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     conv3d_cudnn_grad,
-    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index 3f97dc7ee0..3da0a9001a 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -28,7 +28,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
 
-static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;
+static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
+    static_cast<size_t>(1024) * 1024 * 1024;
 
 template <typename T>
 class CudnnConvOpKernel : public framework::OpKernel<T> {
@@ -44,7 +45,8 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
-    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+    int64_t user_workspace_size =
+        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -116,7 +118,8 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     }
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
-    auto handle = ctx.cuda_device_context().cudnn_handle();
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
 
     PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -163,7 +166,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
-    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+    int64_t user_workspace_size =
+        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
 
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
@@ -235,7 +239,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
       workspace_size_limit = user_workspace_size * 1024 * 1024;
     }
 
-    auto handle = ctx.cuda_device_context().cudnn_handle();
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
@@ -310,16 +315,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(conv2d_cudnn,
-                       paddle::operators::CudnnConvOpKernel<float>,
-                       paddle::operators::CudnnConvOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad,
-                       paddle::operators::CudnnConvGradOpKernel<float>,
-                       paddle::operators::CudnnConvGradOpKernel<double>);
-
-REGISTER_OP_GPU_KERNEL(conv3d_cudnn,
-                       paddle::operators::CudnnConvOpKernel<float>,
-                       paddle::operators::CudnnConvOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad,
-                       paddle::operators::CudnnConvGradOpKernel<float>,
-                       paddle::operators::CudnnConvGradOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(conv2d_cudnn,
+                        paddle::operators::CudnnConvOpKernel<float>,
+                        paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(conv2d_cudnn_grad,
+                        paddle::operators::CudnnConvGradOpKernel<float>,
+                        paddle::operators::CudnnConvGradOpKernel<double>);
+
+REGISTER_OP_CUDA_KERNEL(conv3d_cudnn,
+                        paddle::operators::CudnnConvOpKernel<float>,
+                        paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(conv3d_cudnn_grad,
+                        paddle::operators::CudnnConvGradOpKernel<float>,
+                        paddle::operators::CudnnConvGradOpKernel<double>);
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index 462e6d9cbc..7ef805fd44 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -235,16 +235,18 @@ namespace ops = paddle::operators;
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
-REGISTER_OP_CPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
+    conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
 
-REGISTER_OP_CPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
+    conv3d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 546451234a..38615a8bef 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -16,16 +16,18 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
-    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
 
-REGISTER_OP_GPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
-    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv3d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv3d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index 09bff0a68d..d2de4e80f7 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -72,7 +72,7 @@ class ConvOpGrad : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GemmConvKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -141,9 +141,10 @@ class GemmConvKernel : public framework::OpKernel<T> {
     int in_step = static_cast<int>(input->dims()[1]) / groups;
     int out_step = static_cast<int>(output->dims()[1]) / groups;
 
-    math::Vol2ColFunctor<Place, T> vol2col;
-    math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+    math::Vol2ColFunctor<DeviceContext, T> vol2col;
+    math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
 
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     for (int i = 0; i < batch_size; i++) {
       Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
       Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
@@ -157,27 +158,26 @@ class GemmConvKernel : public framework::OpKernel<T> {
           col_matrix.Resize(col_matrix_shape);
         } else if (data_dim == 2U) {
           // im2col
-          im2col(context.device_context(), in_slice, dilations, strides,
+          im2col(dev_ctx, in_slice, dilations, strides,
                  std::vector<int>{paddings[0], paddings[1], paddings[0],
                                   paddings[1]},
                  &col);
         } else if (data_dim == 3U) {
           // vol2col
-          vol2col(context.device_context(), in_slice, dilations, strides,
-                  paddings, &col);
+          vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
         }
 
         // gemm
         Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
         Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        math::matmul<Place, T>(context.device_context(), filter_slice, false,
-                               col_matrix, false, T(1.0), &out_slice, T(0.0));
+        math::matmul<DeviceContext, T>(dev_ctx, filter_slice, false, col_matrix,
+                                       false, T(1.0), &out_slice, T(0.0));
       }
     }
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GemmConvGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -256,14 +256,19 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
       col_matrix.Resize(col_matrix_shape);
     }
 
-    math::SetConstant<Place, T> set_zero;
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
 
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      set_zero(context.device_context(), input_grad, static_cast<T>(0));
 
-      math::Col2VolFunctor<Place, T> col2vol;
-      math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+      // if is_expand is false, the operation of set_zero is unnecessary,
+      // because math::matmul will reset input_grad.
+      if (is_expand) {
+        set_zero(dev_ctx, input_grad, static_cast<T>(0));
+      }
+      math::Col2VolFunctor<DeviceContext, T> col2vol;
+      math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
@@ -282,18 +287,17 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
             col_matrix.ShareDataWith(in_grad_slice);
             col_matrix.Resize(col_matrix_shape);
           }
-          math::matmul<Place, T>(context.device_context(), filter_slice, true,
-                                 out_grad_slice, false, T(1.0), &col_matrix,
-                                 T(0.0));
+          math::matmul<DeviceContext, T>(dev_ctx, filter_slice, true,
+                                         out_grad_slice, false, T(1.0),
+                                         &col_matrix, T(0.0));
 
           if (is_expand && data_dim == 2U) {
-            col2im(context.device_context(), col, dilations, strides,
+            col2im(dev_ctx, col, dilations, strides,
                    std::vector<int>{paddings[0], paddings[1], paddings[0],
                                     paddings[1]},
                    &in_grad_slice);
           } else if (is_expand && data_dim == 3U) {
-            col2vol(context.device_context(), col, dilations, strides, paddings,
-                    &in_grad_slice);
+            col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
           }
         }
       }
@@ -303,9 +307,9 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
       filter_grad->mutable_data<T>(context.GetPlace());
       Tensor filter_grad_ = *filter_grad;
       filter_grad_.Resize(filter_matrix_shape);
-      set_zero(context.device_context(), filter_grad, static_cast<T>(0));
-      math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
-      math::Vol2ColFunctor<Place, T> vol2col;
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
+      math::Vol2ColFunctor<DeviceContext, T> vol2col;
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
             output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
@@ -321,21 +325,20 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
             col_matrix.ShareDataWith(col);
             col_matrix.Resize(col_matrix_shape);
           } else if (data_dim == 2U) {
-            im2col(context.device_context(), in_slice, dilations, strides,
+            im2col(dev_ctx, in_slice, dilations, strides,
                    std::vector<int>{paddings[0], paddings[1], paddings[0],
                                     paddings[1]},
                    &col);
           } else if (data_dim == 3U) {
-            vol2col(context.device_context(), in_slice, dilations, strides,
-                    paddings, &col);
+            vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
           }
 
           // gemm
           Tensor filter_grad_slice =
               filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          math::matmul<Place, T>(context.device_context(), out_grad_slice,
-                                 false, col_matrix, true, T(1.0),
-                                 &filter_grad_slice, T(1.0));
+          math::matmul<DeviceContext, T>(dev_ctx, out_grad_slice, false,
+                                         col_matrix, true, T(1.0),
+                                         &filter_grad_slice, T(1.0));
         }
       }
     }
diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu
index 95e13c38a8..f7ca82ce26 100644
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
@@ -111,7 +111,8 @@ __global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width,
 }  // namespace
 
 template <typename T>
-class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
+class ConvShiftKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     const Tensor *X = context.Input<Tensor>("X");
@@ -132,7 +133,8 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
 
     dim3 grid_dim(num_x_blocks, batch_size);
 
-    auto stream = context.cuda_device_context().stream();
+    auto stream =
+        context.template device_context<platform::CUDADeviceContext>().stream();
 
     ConvShiftForward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
         x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data);
@@ -140,7 +142,7 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
 };
 
 template <typename T>
-class ConvShiftGradKernel<platform::GPUPlace, T>
+class ConvShiftGradKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -159,8 +161,9 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
     int y_width = Y->dims()[1];
     int y_half_width = (y_width - 1) / 2;
 
-    auto &device_ctx = context.cuda_device_context();
-    math::SetConstant<platform::GPUPlace, T> zero;
+    auto &device_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
 
     const int x_per_block = 256;
     int num_x_blocks = DivUp(x_width, x_per_block);
@@ -186,8 +189,9 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(conv_shift,
-                       ops::ConvShiftKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    conv_shift,
+    ops::ConvShiftKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     conv_shift_grad,
-    ops::ConvShiftGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ConvShiftGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h
index 5a160b0f16..1a70b38a0d 100644
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/operators/conv_shift_op.h
@@ -18,13 +18,13 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ConvShiftKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ConvShiftGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override;
diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc
index 0192178ce3..4cb6a2ccff 100644
--- a/paddle/operators/conv_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
@@ -61,12 +61,13 @@ REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_cudnn,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_cudnn_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
 
 REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
             ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad,
@@ -74,9 +75,10 @@ REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
 
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose_cudnn,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose_cudnn_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc
index 494904fe52..f0297f6c40 100644
--- a/paddle/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
@@ -83,7 +83,8 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
     }
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t algo;
-    auto handle = ctx.cuda_device_context().cudnn_handle();
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
     // Get the algorithm
     PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
         handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
@@ -165,7 +166,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
       workspace_size_limit = user_workspace_size * 1024 * 1024;
     }
 
-    auto handle = ctx.cuda_device_context().cudnn_handle();
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       // choose backward algorithm for data
       PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
@@ -234,16 +236,16 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
-                       ops::CudnnConvTransposeOpKernel<float>,
-                       ops::CudnnConvTransposeOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
-                       ops::CudnnConvTransposeGradOpKernel<float>,
-                       ops::CudnnConvTransposeGradOpKernel<double>);
-
-REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
-                       ops::CudnnConvTransposeOpKernel<float>,
-                       ops::CudnnConvTransposeOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
-                       ops::CudnnConvTransposeGradOpKernel<float>,
-                       ops::CudnnConvTransposeGradOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn,
+                        ops::CudnnConvTransposeOpKernel<float>,
+                        ops::CudnnConvTransposeOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn_grad,
+                        ops::CudnnConvTransposeGradOpKernel<float>,
+                        ops::CudnnConvTransposeGradOpKernel<double>);
+
+REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn,
+                        ops::CudnnConvTransposeOpKernel<float>,
+                        ops::CudnnConvTransposeOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn_grad,
+                        ops::CudnnConvTransposeGradOpKernel<float>,
+                        ops::CudnnConvTransposeGradOpKernel<double>);
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
index 678b192dea..ca063e94bb 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -197,21 +197,23 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
 
 REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
             conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc
index 4165eb0c7b..b91ebd7922 100644
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
@@ -16,20 +16,24 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
     conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     double>);
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
     conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     double>);
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
index 1cacb770e6..1171b0435f 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -52,7 +52,7 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GemmConvTransposeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -109,11 +109,12 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     filter.Resize(filter_matrix_shape);
 
     output->mutable_data<T>(context.GetPlace());
-    math::SetConstant<Place, T> set_zero;
-    set_zero(context.device_context(), output, static_cast<T>(0));
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    set_zero(dev_ctx, output, static_cast<T>(0));
 
-    math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
-    math::Col2VolFunctor<Place, T> col2vol;
+    math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
+    math::Col2VolFunctor<DeviceContext, T> col2vol;
     std::vector<int> dilations({1, 1, 1});
 
     // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
@@ -127,29 +128,27 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
 
       // col_matrix = filter * input_batch
       // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-      math::matmul<Place, T>(context.device_context(), filter, true,
-                             input_batch, false, static_cast<T>(1.0),
-                             &col_matrix, static_cast<T>(0.0));
+      math::matmul<DeviceContext, T>(dev_ctx, filter, true, input_batch, false,
+                                     static_cast<T>(1.0), &col_matrix,
+                                     static_cast<T>(0.0));
 
       if (data_dim == 2U) {
         // col2im: col_matrix -> dy
         // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
-        col2im(context.device_context(), col,
-               std::vector<int>{dilations[0], dilations[1]}, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
+        col2im(dev_ctx, col, std::vector<int>{dilations[0], dilations[1]},
+               strides, std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                         paddings[1]},
                &output_batch);
       } else if (data_dim == 3U) {
         // col2vol: col_matrix -> dy
         // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(context.device_context(), col, dilations, strides, paddings,
-                &output_batch);
+        col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch);
       }
     }
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -206,6 +205,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     // convolution transpose grad on input:
     // im2col + gemm (similar to conv-forward)
     // input need to compute gradient
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (input_grad || filter_grad) {
       Tensor col;
       col.mutable_data<T>(col_shape, context.GetPlace());
@@ -217,19 +217,18 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
       col_matrix.Resize(col_matrix_shape);
 
       Tensor filter_grad_;
-      math::SetConstant<Place, T> set_zero;
+      math::SetConstant<DeviceContext, T> set_zero;
 
-      math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
-      math::Vol2ColFunctor<Place, T> vol2col;
+      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
+      math::Vol2ColFunctor<DeviceContext, T> vol2col;
       std::vector<int> dilations({1, 1, 1});
 
       if (input_grad) {
         input_grad->mutable_data<T>(context.GetPlace());
-        set_zero(context.device_context(), input_grad, static_cast<T>(0));
       }
       if (filter_grad) {  // filter size (m, c, k_h, k_w)
         filter_grad->mutable_data<T>(context.GetPlace());
-        set_zero(context.device_context(), filter_grad, static_cast<T>(0));
+        set_zero(dev_ctx, filter_grad, static_cast<T>(0));
         filter_grad_ = *filter_grad;
         filter_grad_.Resize(filter_matrix_shape);
       }
@@ -242,7 +241,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
         if (data_dim == 2U) {
           // im2col: dy -> col matrix
           // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
-          im2col(context.device_context(), output_grad_batch,
+          im2col(dev_ctx, output_grad_batch,
                  std::vector<int>{dilations[0], dilations[1]}, strides,
                  std::vector<int>{paddings[0], paddings[1], paddings[0],
                                   paddings[1]},
@@ -250,8 +249,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
         } else if (data_dim == 3U) {
           // vol2col: dy -> col_matrix
           // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
-          vol2col(context.device_context(), output_grad_batch, dilations,
-                  strides, paddings, &col);
+          vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings,
+                  &col);
         }
 
         if (input_grad) {
@@ -263,9 +262,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
           // d, h, w)
-          math::matmul<Place, T>(context.device_context(), filter, false,
-                                 col_matrix, false, static_cast<T>(1.0),
-                                 &input_grad_batch, static_cast<T>(0.0));
+          math::matmul<DeviceContext, T>(
+              dev_ctx, filter, false, col_matrix, false, static_cast<T>(1.0),
+              &input_grad_batch, static_cast<T>(0.0));
         }
         if (filter_grad) {
           // input batch
@@ -275,9 +274,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
           // k_h * k_w)
-          math::matmul<Place, T>(context.device_context(), in_batch, false,
-                                 col_matrix, true, static_cast<T>(1.0),
-                                 &filter_grad_, static_cast<T>(1.0));
+          math::matmul<DeviceContext, T>(dev_ctx, in_batch, false, col_matrix,
+                                         true, static_cast<T>(1.0),
+                                         &filter_grad_, static_cast<T>(1.0));
         }
       }
     }
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
index 312264ccd4..440c427cba 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -155,7 +155,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
             ops::CosSimOpGrad);
-REGISTER_OP_CPU_KERNEL(cos_sim,
-                       ops::CosSimKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    cos_sim_grad, ops::CosSimGradKernel<paddle::platform::CPUPlace, float>);
+    cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    cos_sim_grad,
+    ops::CosSimGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu
index 0cb8fd26de..1cb01f5945 100644
--- a/paddle/operators/cos_sim_op.cu
+++ b/paddle/operators/cos_sim_op.cu
@@ -16,7 +16,8 @@
 #include "paddle/operators/cos_sim_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(cos_sim,
-                       ops::CosSimKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    cos_sim_grad, ops::CosSimGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    cos_sim, ops::CosSimKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    cos_sim_grad,
+    ops::CosSimGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index 62a4e484ec..fecb5a79b2 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CosSimKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -51,7 +51,8 @@ class CosSimKernel : public framework::OpKernel<T> {
     auto y_norm = EigenVector<T>::Flatten(*out_y_norm);
 
     // compute
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     auto row_along = Eigen::array<int, 1>({{1}});
     x_norm.device(place) = x.square().sum(row_along).sqrt();
     y_norm.device(place) = y.square().sum(row_along).sqrt();
@@ -66,7 +67,7 @@ class CosSimKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CosSimGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -96,7 +97,8 @@ class CosSimGradKernel : public framework::OpKernel<T> {
     auto z_bcast = z.broadcast(bcast_cols);
     auto dz_bcast = dz.broadcast(bcast_cols);
     auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols);
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     if (rows_x == rows_y) {
       auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols);
       auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols);
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc
index 291b23ed1b..1ce189fa6e 100644
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -135,5 +135,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp,
                              ops::CRFDecodingOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    crf_decoding, ops::CRFDecodingOpKernel<paddle::platform::CPUPlace, float>,
-    ops::CRFDecodingOpKernel<paddle::platform::CPUPlace, double>);
+    crf_decoding,
+    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h
index 57b5e21b3a..f6827b7b11 100644
--- a/paddle/operators/crf_decoding_op.h
+++ b/paddle/operators/crf_decoding_op.h
@@ -24,7 +24,7 @@ using framework::LoDTensor;
 using framework::LoD;
 using framework::Tensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CRFDecodingOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -44,8 +44,8 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     const size_t seq_num = lod[level].size() - 1;
 
     int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
-    math::SetConstant<platform::CPUPlace, int64_t>()(ctx.device_context(),
-                                                     decoded_path, 0);
+    math::SetConstant<DeviceContext, int64_t>()(
+        ctx.template device_context<DeviceContext>(), decoded_path, 0);
     for (size_t i = 0; i < seq_num; ++i) {
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc
index 6752eb8c1c..5c973fbb3c 100644
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -88,7 +88,8 @@ There are two ways to set shape:
 
 The input should be a k-D tensor(k > 0 and k < 7). As an example:
 
-Given:
+Case 1:
+Given
 
     X = [[0, 1, 2, 0, 0]
          [0, 3, 4, 0, 0]
@@ -107,6 +108,27 @@ we get:
     Out = [[1, 2],
            [3, 4]].
 
+
+Case 2:
+Given
+
+    X = [[0, 1, 2, 5, 0]
+         [0, 3, 4, 6, 0]
+         [0, 0, 0, 0, 0]],
+
+and
+
+    offsets = [0, 1],
+
+and
+
+    Y = [[0, 0, 0]
+         [0, 0, 0]],
+
+we get:
+
+    Out = [[1, 2, 5],
+           [3, 4, 6]].
 )DOC");
   }
 };
@@ -133,5 +155,5 @@ class CropOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
 REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
-REGISTER_OP_CPU_KERNEL(crop_grad,
-                       ops::CropGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu
index f8ee18a1d6..90fd83ca10 100644
--- a/paddle/operators/crop_op.cu
+++ b/paddle/operators/crop_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/crop_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel<float>);
-REGISTER_OP_GPU_KERNEL(crop_grad,
-                       ops::CropGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
index 2e72583d68..d531a19c78 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -49,7 +49,7 @@ class CropKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T, size_t D>
+template <typename DeviceContext, typename T, size_t D>
 void CropGradFunction(const framework::ExecutionContext& context) {
   auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
   if (d_x != nullptr) {
@@ -63,12 +63,13 @@ void CropGradFunction(const framework::ExecutionContext& context) {
     }
     auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
     auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
-    d_x_tensor.device(context.GetEigenDevice<Place>()) =
+    d_x_tensor.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
         d_out_tensor.pad(paddings, 0);
   }
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CropGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -76,22 +77,22 @@ class CropGradKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
     switch (rank) {
       case 1:
-        CropGradFunction<Place, T, 1>(context);
+        CropGradFunction<DeviceContext, T, 1>(context);
         break;
       case 2:
-        CropGradFunction<Place, T, 2>(context);
+        CropGradFunction<DeviceContext, T, 2>(context);
         break;
       case 3:
-        CropGradFunction<Place, T, 3>(context);
+        CropGradFunction<DeviceContext, T, 3>(context);
         break;
       case 4:
-        CropGradFunction<Place, T, 4>(context);
+        CropGradFunction<DeviceContext, T, 4>(context);
         break;
       case 5:
-        CropGradFunction<Place, T, 5>(context);
+        CropGradFunction<DeviceContext, T, 5>(context);
         break;
       case 6:
-        CropGradFunction<Place, T, 6>(context);
+        CropGradFunction<DeviceContext, T, 6>(context);
         break;
       default:
         PADDLE_THROW(
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 1e82742eaf..2b06012b69 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -95,6 +95,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                         "Input(Label) should be 1.");
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
   }
 
  protected:
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 6212e39dfd..0546964588 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -53,8 +53,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     Tensor* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
 
-    math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        ctx.device_context(), y, x, label, ctx.Attr<bool>("soft_label"));
+    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+        ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,
+        ctx.Attr<bool>("soft_label"));
   }
 };
 
@@ -80,15 +81,17 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
 
     int block = 512;
     int grid = (batch_size * class_num + block - 1) / block;
-    auto stream = ctx.cuda_device_context().stream();
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto stream = dev_ctx.stream();
 
     if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = label->data<T>();
       SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
           dx_data, dy_data, x_data, label_data, batch_size, class_num);
     } else {
-      math::SetConstant<platform::GPUPlace, T> functor;
-      functor(ctx.device_context(), dx, 0);
+      math::SetConstant<platform::CUDADeviceContext, T> functor;
+      functor(dev_ctx, dx, 0);
       auto* label_data = label->data<int64_t>();
       grid = (batch_size + block - 1) / block;
       CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
@@ -101,8 +104,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
-                       ops::CrossEntropyOpCUDAKernel<double>);
-REGISTER_OP_GPU_KERNEL(cross_entropy_grad,
-                       ops::CrossEntropyGradientOpCUDAKernel<float>,
-                       ops::CrossEntropyGradientOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
+                        ops::CrossEntropyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyGradientOpCUDAKernel<float>,
+                        ops::CrossEntropyGradientOpCUDAKernel<double>);
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 37db0a930a..5623d2ded1 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -37,8 +37,9 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
     Tensor* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
 
-    math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        ctx.device_context(), y, x, labels, ctx.Attr<bool>("soft_label"));
+    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
+        ctx.template device_context<platform::CPUDeviceContext>(), y, x, labels,
+        ctx.Attr<bool>("soft_label"));
   }
 };
 
@@ -61,7 +62,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
       auto lbl_mat = EigenMatrix<T>::From(*label);
       auto dx_mat = EigenMatrix<T>::From(*dx);
 
-      dx_mat.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+      dx_mat.device(*ctx.template device_context<platform::CPUDeviceContext>()
+                         .eigen_device()) =
           -(lbl_mat *
             dy_mat.broadcast(Eigen::DSizes<int64_t, 2>(1, class_num)) / x_mat);
     } else {
@@ -70,8 +72,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
       const T* x_data = x->data<T>();
       const int64_t* label_data = label->data<int64_t>();
 
-      math::SetConstant<platform::CPUPlace, T> functor;
-      functor(ctx.device_context(), dx, 0);
+      math::SetConstant<platform::CPUDeviceContext, T> functor;
+      functor(ctx.template device_context<platform::CPUDeviceContext>(), dx, 0);
 
       for (int64_t i = 0; i < batch_size; ++i) {
         PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc
index 640b4e7744..fd29c7270b 100644
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
@@ -99,4 +99,4 @@ REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp,
                              ops::DecayedAdagradOpMaker);
 REGISTER_OP_CPU_KERNEL(
     decayed_adagrad,
-    ops::DecayedAdagradOpKernel<paddle::platform::CPUPlace, float>);
+    ops::DecayedAdagradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu
index 6fce77fe4e..282b90f275 100644
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/operators/decayed_adagrad_op.cu
@@ -16,6 +16,6 @@
 #include "paddle/operators/decayed_adagrad_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     decayed_adagrad,
-    ops::DecayedAdagradOpKernel<paddle::platform::GPUPlace, float>);
+    ops::DecayedAdagradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h
index 0fe0fc5acd..fec9705cfc 100644
--- a/paddle/operators/decayed_adagrad_op.h
+++ b/paddle/operators/decayed_adagrad_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class DecayedAdagradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -43,7 +43,7 @@ class DecayedAdagradOpKernel : public framework::OpKernel<T> {
 
     auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
     auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     moment_out.device(place) = decay * moment + (1 - decay) * grad * grad;
     Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index 932c0bf8fb..acd526ae80 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -100,6 +100,8 @@ namespace ops = paddle::operators;
 REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
             ops::DropoutOpGrad<float>);
 REGISTER_OP_CPU_KERNEL(
-    dropout, ops::CPUDropoutKernel<paddle::platform::CPUPlace, float, float>);
+    dropout,
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
 REGISTER_OP_CPU_KERNEL(
-    dropout_grad, ops::DropoutGradKernel<paddle::platform::CPUPlace, float>);
+    dropout_grad,
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index db3578b9bf..10c670751d 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -58,7 +58,7 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     auto X = EigenMatrix<T>::Reshape(*x, 1);
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
 
-    auto place = context.GetEigenDevice<Place>();
+    auto& place = *context.template device_context<Place>().eigen_device();
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
@@ -80,7 +80,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    dropout, ops::GPUDropoutKernel<paddle::platform::GPUPlace, float, float>);
-REGISTER_OP_GPU_KERNEL(
-    dropout_grad, ops::DropoutGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    dropout,
+    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
+REGISTER_OP_CUDA_KERNEL(
+    dropout_grad,
+    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index d9a130fdc0..84ad39f0bb 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T, typename AttrType>
+template <typename DeviceContext, typename T, typename AttrType>
 class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -55,13 +55,14 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     } else {
       auto X = EigenMatrix<T>::Reshape(*x, 1);
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
-      auto place = context.GetEigenDevice<Place>();
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
       Y.device(place) = X * dropout_prob;
     }
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class DropoutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -77,7 +78,8 @@ class DropoutGradKernel : public framework::OpKernel<T> {
     auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
     auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
 
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     dX.device(place) = dY * M;
   }
 };
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc
index 432b9ba6f7..a62eeeeb95 100644
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
@@ -34,13 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
             elementwise_add_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu
index 7591428ac7..78642bb424 100644
--- a/paddle/operators/elementwise_add_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
@@ -17,15 +17,16 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, int64_t>);
-REGISTER_OP_GPU_KERNEL(
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
     elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, int64_t>);
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index 3a198c167e..069bdaf0ab 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -21,10 +21,10 @@ namespace operators {
 
 template <typename T>
 struct AddFunctor {
-  HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -34,8 +34,8 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto* y = ctx.Input<Tensor>("Y");
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-    TransformFunctor<AddFunctor<T>, T, Place> functor(
-        x, y, z, ctx.device_context(), AddFunctor<T>());
+    TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
+        x, y, z, ctx.template device_context<DeviceContext>(), AddFunctor<T>());
 
     auto x_dims = x->dims();
     auto y_dims = y->dims();
@@ -137,11 +137,11 @@ struct ElementwiseAddBroadCast2GradFunctor {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
                            ElementwiseAddOneGradFunctor<T>,
                            ElementwiseAddBroadCastGradFunctor<T>,
                            ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc
index 7a325199bd..1c3e9e70ee 100644
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
@@ -35,13 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
             elementwise_div_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu
index de4d0c3344..502c528936 100644
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/operators/elementwise_div_op.cu
@@ -17,15 +17,16 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, int64_t>);
-REGISTER_OP_GPU_KERNEL(
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
     elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, int64_t>);
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
index 8946ff3d25..d91313db42 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -19,11 +19,11 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseDivKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
+    ElementwiseCompute<EigenDivFunctor, DeviceContext, T>(ctx);
   }
 };
 
@@ -102,11 +102,11 @@ struct ElementwiseDivBroadCast2GradFunctor {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseDivGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
                            ElementwiseDivGradFunctor<T>,
                            ElementwiseDivBroadCastGradFunctor<T>,
                            ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index 8851267a52..aadb95cbe3 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -36,13 +36,13 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
             elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
index b0dfdee1cc..089451b3e1 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -17,15 +17,16 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, int64_t>);
-REGISTER_OP_GPU_KERNEL(
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, int64_t>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 4469b07eaa..16fa5ec4b3 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -18,11 +18,11 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
+    ElementwiseCompute<EigenMulFunctor, DeviceContext, T>(ctx);
   }
 };
 
@@ -101,11 +101,11 @@ struct ElementwiseMulBroadCast2GradFunctor {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseMulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
                            ElementwiseMulGradFunctor<T>,
                            ElementwiseMulBroadCastGradFunctor<T>,
                            ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index ec448a9e95..7ebfc7df8c 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -59,29 +59,31 @@ inline void get_mid_dims(const framework::DDim& x_dims,
   }
 }
 
-template <typename T, typename Place>
+template <typename T, typename DeviceContext>
 class RowwiseTransformIterator;
-template <typename T, typename Place>
+template <typename T, typename DeviceContext>
 class MidWiseTransformIterator;
 
 template <typename T>
-class RowwiseTransformIterator<T, platform::CPUPlace> {
+class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
  public:
   RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
 
-  RowwiseTransformIterator<T, platform::CPUPlace>& operator++() {
+  RowwiseTransformIterator<T, platform::CPUDeviceContext>& operator++() {
     ++i_;
-    i_ %= n_;
+    if (UNLIKELY(i_ == n_)) {
+      i_ = 0;
+    }
     return *this;
   }
 
-  bool operator==(
-      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+  bool operator==(const RowwiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
     return (ptr_ + i_) == &(*rhs);
   }
 
-  bool operator!=(
-      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+  bool operator!=(const RowwiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
     return (ptr_ + i_) != &(*rhs);
   }
 
@@ -94,23 +96,28 @@ class RowwiseTransformIterator<T, platform::CPUPlace> {
 };
 
 template <typename T>
-class MidWiseTransformIterator<T, platform::CPUPlace> {
+class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
  public:
   MidWiseTransformIterator(const T* ptr, int n, int post)
       : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
 
-  MidWiseTransformIterator<T, platform::CPUPlace>& operator++() {
-    i_ = (++j_ / post_) % n_;
+  MidWiseTransformIterator<T, platform::CPUDeviceContext>& operator++() {
+    ++j_;
+    i_ = j_ / post_;
+    if (UNLIKELY(i_ == n_)) {
+      j_ = 0;
+      i_ = 0;
+    }
     return *this;
   }
 
-  bool operator==(
-      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+  bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
     return (ptr_ + i_) == &(*rhs);
   }
 
-  bool operator!=(
-      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+  bool operator!=(const MidWiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
     return (ptr_ + i_) != &(*rhs);
   }
 
@@ -126,12 +133,12 @@ class MidWiseTransformIterator<T, platform::CPUPlace> {
 
 #ifdef __NVCC__
 template <typename T>
-class RowwiseTransformIterator<T, platform::GPUPlace>
+class RowwiseTransformIterator<T, platform::CUDADeviceContext>
     : public thrust::iterator_adaptor<
-          RowwiseTransformIterator<T, platform::GPUPlace>, const T*> {
+          RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*> {
  public:
   typedef thrust::iterator_adaptor<
-      RowwiseTransformIterator<T, platform::GPUPlace>, const T*>
+      RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
       super_t;
   HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
       : super_t(x), begin_(x), n_(n){};
@@ -146,12 +153,12 @@ class RowwiseTransformIterator<T, platform::GPUPlace>
 };
 
 template <typename T>
-class MidWiseTransformIterator<T, platform::GPUPlace>
+class MidWiseTransformIterator<T, platform::CUDADeviceContext>
     : public thrust::iterator_adaptor<
-          MidWiseTransformIterator<T, platform::GPUPlace>, const T*> {
+          MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*> {
  public:
   typedef thrust::iterator_adaptor<
-      MidWiseTransformIterator<T, platform::GPUPlace>, const T*>
+      MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
       super_t;
   HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
       : super_t(x), begin_(x), n_(n), post_(post){};
@@ -167,12 +174,11 @@ class MidWiseTransformIterator<T, platform::GPUPlace>
 };
 #endif
 
-template <typename Functor, typename T, typename Place>
+template <typename Functor, typename T, typename DeviceContext>
 class TransformFunctor {
  public:
   TransformFunctor(const framework::Tensor* x, const framework::Tensor* y,
-                   framework::Tensor* z, const platform::DeviceContext& ctx,
-                   Functor func)
+                   framework::Tensor* z, const DeviceContext& ctx, Functor func)
       : x_(x->data<T>()),
         y_(y->data<T>()),
         z_(z->mutable_data<T>(ctx.GetPlace())),
@@ -181,20 +187,20 @@ class TransformFunctor {
         func_(func) {}
 
   inline void Run() const {
-    platform::Transform<Place> trans;
+    platform::Transform<DeviceContext> trans;
     trans(ctx_, x_, x_ + nx_, y_, z_, func_);
   }
 
   inline void RunRowWise(int n, int pre) const {
-    platform::Transform<Place> trans;
-    trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator<T, Place>(y_, n), z_,
-          func_);
+    platform::Transform<DeviceContext> trans;
+    trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator<T, DeviceContext>(y_, n),
+          z_, func_);
   }
 
   inline void RunMidWise(int n, int pre, int post) const {
-    platform::Transform<Place> trans;
-    trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator<T, Place>(y_, n, post),
-          z_, func_);
+    platform::Transform<DeviceContext> trans;
+    trans(ctx_, x_, x_ + nx_,
+          MidWiseTransformIterator<T, DeviceContext>(y_, n, post), z_, func_);
   }
 
  private:
@@ -202,22 +208,24 @@ class TransformFunctor {
   const T* y_;
   T* z_;
   int64_t nx_;
-  const platform::DeviceContext& ctx_;
+  const DeviceContext& ctx_;
   Functor func_;
 };
 
 #define EIGEN_FUNCTOR(name, eigen_op)                                          \
   struct Eigen##name##Functor {                                                \
-    template <typename Place, typename T>                                      \
+    template <typename DeviceContext, typename T>                              \
     inline void Run(const framework::Tensor* x, const framework::Tensor* y,    \
                     framework::Tensor* z,                                      \
                     const framework::ExecutionContext& ctx) {                  \
       auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
       auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
       auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
-      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_e);            \
+      z_e.device(                                                              \
+          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
+          eigen_op(x_e, y_e);                                                  \
     }                                                                          \
-    template <typename Place, typename T>                                      \
+    template <typename DeviceContext, typename T>                              \
     inline void RunBroadCast(const framework::Tensor* x,                       \
                              const framework::Tensor* y, framework::Tensor* z, \
                              const framework::ExecutionContext& ctx, int pre,  \
@@ -228,9 +236,11 @@ class TransformFunctor {
       auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))                  \
                          .broadcast(Eigen::DSizes<int, 2>(pre, 1))             \
                          .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
-      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+      z_e.device(                                                              \
+          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
+          eigen_op(x_e, y_bcast);                                              \
     }                                                                          \
-    template <typename Place, typename T>                                      \
+    template <typename DeviceContext, typename T>                              \
     inline void RunBroadCast2(const framework::Tensor* x,                      \
                               const framework::Tensor* y,                      \
                               framework::Tensor* z,                            \
@@ -242,11 +252,13 @@ class TransformFunctor {
       auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))               \
                          .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))       \
                          .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
-      z_e.device(ctx.GetEigenDevice<Place>()) = eigen_op(x_e, y_bcast);        \
+      z_e.device(                                                              \
+          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
+          eigen_op(x_e, y_bcast);                                              \
     }                                                                          \
   }
 
-template <class functor, typename Place, typename T>
+template <class functor, typename DeviceContext, typename T>
 void ElementwiseCompute(const framework::ExecutionContext& ctx) {
   using Tensor = framework::Tensor;
 
@@ -262,7 +274,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
 
   if (x_dims == y_dims) {
     functor f;
-    f.template Run<Place, T>(x, y, z, ctx);
+    f.template Run<DeviceContext, T>(x, y, z, ctx);
     return;
   }
 
@@ -275,11 +287,11 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
   get_mid_dims(x_dims, y_dims, axis, pre, n, post);
   if (post == 1) {
     functor f;
-    f.template RunBroadCast<Place, T>(x, y, z, ctx, pre, n);
+    f.template RunBroadCast<DeviceContext, T>(x, y, z, ctx, pre, n);
     return;
   } else {
     functor f;
-    f.template RunBroadCast2<Place, T>(x, y, z, ctx, pre, n, post);
+    f.template RunBroadCast2<DeviceContext, T>(x, y, z, ctx, pre, n, post);
     return;
   }
 }
@@ -296,8 +308,9 @@ EIGEN_FUNCTOR(Mul, EIGEN_MUL);
 #define EIGEN_DIV(x, y) ((x) / (y))
 EIGEN_FUNCTOR(Div, EIGEN_DIV);
 
-template <typename Place, typename T, typename functor, typename functor1,
-          typename broadcastfunctor, typename broadcast2functor>
+template <typename DeviceContext, typename T, typename functor,
+          typename functor1, typename broadcastfunctor,
+          typename broadcast2functor>
 void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
   using Tensor = framework::Tensor;
 
@@ -306,7 +319,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
   auto* out = ctx.Input<Tensor>("Out");
   auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-  auto place = ctx.GetEigenDevice<Place>();
+  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
   auto x_dims = x->dims();
   auto y_dims = y->dims();
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc
index 95d7979e39..3e4d19361e 100644
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -34,13 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
             elementwise_sub_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, double>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int64_t>);
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu
index ec23bec35f..0b2f0f7d4d 100644
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/operators/elementwise_sub_op.cu
@@ -17,15 +17,16 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, int64_t>);
-REGISTER_OP_GPU_KERNEL(
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
     elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, double>,
-    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, int64_t>);
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
index 3f40c1c5bc..731a30c5e3 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -18,11 +18,11 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
+    ElementwiseCompute<EigenSubFunctor, DeviceContext, T>(ctx);
   }
 };
 
@@ -101,11 +101,11 @@ struct ElementwiseSubBroadCast2GradFunctor {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ElementwiseSubGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
                            ElementwiseSubOneGradFunctor<T>,
                            ElementwiseSubBroadCastGradFunctor<T>,
                            ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
index 282775fcda..8b3cddbb94 100644
--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
@@ -130,7 +130,8 @@ class ExpandGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
             ops::ExpandGradOp);
-REGISTER_OP_CPU_KERNEL(expand,
-                       ops::ExpandKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    expand_grad, ops::ExpandGradKernel<paddle::platform::CPUPlace, float>);
+    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad,
+    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu
index 6744562b6c..99ee584d08 100644
--- a/paddle/operators/expand_op.cu
+++ b/paddle/operators/expand_op.cu
@@ -17,7 +17,8 @@
 #include "paddle/operators/expand_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(expand,
-                       ops::ExpandKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    expand_grad, ops::ExpandGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    expand_grad,
+    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
index 4d7996ad1e..14ef8b0912 100644
--- a/paddle/operators/expand_op.h
+++ b/paddle/operators/expand_op.h
@@ -56,7 +56,7 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ExpandKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -83,12 +83,13 @@ class ExpandKernel : public framework::OpKernel<T> {
     auto x = EigenTensor<T, Rank>::From(*in0);
     out0->mutable_data<T>(context.GetPlace());
     auto y = EigenTensor<T, Rank>::From(*out0);
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     y.device(place) = x.broadcast(bcast_dims);
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -164,7 +165,8 @@ class ExpandGradKernel : public framework::OpKernel<T> {
       reduce_dims[i] = reduce_dims_vec[i];
     }
     auto out_grad = EigenVector<T>::Flatten(*in0);
-    x_grad.device(context.GetEigenDevice<Place>()) =
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
         out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions());
   }
 };
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 892922cd3a..7fb74e2b95 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -100,8 +100,11 @@ REGISTER_OPERATOR(fill_constant_batch_size_like,
                   ops::FillConstantBatchSizeLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
     fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
+                                           float>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
+                                           double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
+                                           int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
                                            int64_t>);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
index 9e7a1eeab8..2e0e15f36b 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
@@ -16,10 +16,13 @@
 #include "paddle/framework/op_registry.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
+                                           double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
+                                           int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
                                            int64_t>);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h
index 339d97a30a..66da9d0307 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/operators/fill_constant_batch_size_like_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -27,8 +27,9 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(ctx.GetPlace());
     auto value = ctx.Attr<float>("value");
 
-    math::SetConstant<Place, T> setter;
-    setter(ctx.device_context(), out, static_cast<T>(value));
+    math::SetConstant<DeviceContext, T> setter;
+    setter(ctx.template device_context<DeviceContext>(), out,
+           static_cast<T>(value));
   }
 };
 
diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc
new file mode 100644
index 0000000000..382e161c5d
--- /dev/null
+++ b/paddle/operators/fill_op.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+struct FillOpVisitor {
+  FillOpVisitor(framework::LoDTensor *tensor, const std::vector<float> &value)
+      : tensor_(tensor), value_(value) {}
+
+  template <typename T>
+  void operator()() const {
+    platform::CPUPlace cpu;
+    auto *data = tensor_->mutable_data<T>(cpu);
+    std::transform(value_.data(), value_.data() + tensor_->numel(), data,
+                   [](float dat) { return static_cast<T>(dat); });
+  }
+
+  framework::LoDTensor *tensor_;
+  const std::vector<float> &value_;
+};
+
+class FillOp : public framework::OperatorBase {
+ public:
+  FillOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &out =
+        detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
+                                "Cannot find variable %s", Output("Out"))
+                        .GetMutable<framework::LoDTensor>());
+    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    auto dtype = static_cast<framework::DataType>(Attr<int>("dtype"));
+    platform::CPUPlace cpu;
+    auto force_cpu = Attr<bool>("force_cpu");
+    out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(),
+                     framework::ToTypeIndex(dtype));
+
+    framework::LoDTensor tensor;
+
+    if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) {
+      tensor.ShareDataWith(out);
+    } else {
+      // Always make tensor in CPU memory.
+      tensor.Resize(out.dims());
+      tensor.mutable_data(cpu, framework::ToTypeIndex(dtype));
+    }
+
+    framework::VisitDataType(
+        dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
+
+    if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) {
+      // Copy tensor to out
+      framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out);
+    }
+  }
+};
+
+class FillOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment(R"DOC(Fill operator
+
+Fill an tensor with `value` and `shape`. The type of the tensor is specify by
+`dtype`.
+)DOC");
+    AddOutput("Out", "(LoDTensor) The output tensor.");
+    AddAttr<std::vector<float>>(
+        "value", "The float values of tensor, which are flatten in row major");
+    AddAttr<std::vector<int>>("shape", "The shape of output tensor");
+    AddAttr<int>("dtype", "The data type of output tensor, Default is float")
+        .SetDefault(framework::DataType::FP32);
+    AddAttr<bool>("force_cpu",
+                  "Whether the output tensor must be at CPU memory or not. "
+                  "Default is false.")
+        .SetDefault(false);
+  }
+};
+
+class FillOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(
+        "Out",
+        framework::make_ddim(context->Attrs().Get<std::vector<int>>("shape")));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker);
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index 95fb5932b8..720c11f5f1 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -54,8 +54,9 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
                              ops::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int64_t>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, double>,
-    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, bool>);
+    fill_zeros_like,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc
index 1501a17441..9f412306bb 100644
--- a/paddle/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/operators/fill_zeros_like_op.cu.cc
@@ -16,9 +16,10 @@
 #include "paddle/framework/op_registry.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int>,
-    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int64_t>,
-    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>,
-    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, double>,
-    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, bool>);
+REGISTER_OP_CUDA_KERNEL(
+    fill_zeros_like,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index 7e7d78eea2..a6e2941f52 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -19,15 +19,16 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* out = context.Output<framework::Tensor>("Y");
     out->mutable_data<T>(context.GetPlace());
 
-    math::SetConstant<Place, T> setter;
-    setter(context.device_context(), out, static_cast<T>(0));
+    math::SetConstant<DeviceContext, T> setter;
+    setter(context.template device_context<DeviceContext>(), out,
+           static_cast<T>(0));
   }
 };
 
diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc
index cb7ae69196..b14913ff21 100644
--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/operators/ftrl_op.cc
@@ -135,5 +135,5 @@ The paper that proposed Follow The Regularized Leader (FTRL):
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker);
-REGISTER_OP_CPU_KERNEL(ftrl,
-                       ops::FTRLOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    ftrl, ops::FTRLOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu
index 97b36dade6..abbbe7adbe 100644
--- a/paddle/operators/ftrl_op.cu
+++ b/paddle/operators/ftrl_op.cu
@@ -15,5 +15,5 @@ specific language governing permissions and limitations under the License. */
 #include "paddle/operators/ftrl_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(ftrl,
-                       ops::FTRLOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    ftrl, ops::FTRLOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h
index b040162f8d..4eea04cd8d 100644
--- a/paddle/operators/ftrl_op.h
+++ b/paddle/operators/ftrl_op.h
@@ -24,7 +24,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class FTRLOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -53,7 +53,7 @@ class FTRLOpKernel : public framework::OpKernel<T> {
     auto p_out = EigenVector<T>::Flatten(*param_out);
     auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
     auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
 
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
index 8d04ecd284..c806aa5f05 100644
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 using framework::Tensor;
-using platform::Place;
+using platform::DeviceContext;
 
 #define CUDA_1D_KERNEL_LOOP(i, n)                              \
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
index 92219d6a43..b37f0576e2 100644
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@@ -49,7 +49,8 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto place = ctx.GetEigenDevice<platform::GPUPlace>();
+    auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
+                       .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
     GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
@@ -60,5 +61,5 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
-REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index 8276ed0d3d..1a1ba0c41a 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -53,7 +53,8 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
 
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
+    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
 
     ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index 315560bf1b..ffce6f7138 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -60,5 +60,5 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(gaussian_random,
-                       paddle::operators::GPUGaussianRandomKernel<float>);
+REGISTER_OP_CUDA_KERNEL(gaussian_random,
+                        paddle::operators::GPUGaussianRandomKernel<float>);
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
index 5aa03f8916..311e7edcf1 100644
--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
@@ -213,8 +213,9 @@ class GRUGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
-REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
-                       ops::GRUKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(gru_grad,
-                       ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    gru_grad, ops::GRUGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc
index 0ceff94ec3..458630ca61 100644
--- a/paddle/operators/gru_op.cu.cc
+++ b/paddle/operators/gru_op.cu.cc
@@ -15,8 +15,9 @@
 #include "paddle/operators/gru_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
-                       ops::GRUKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(gru_grad,
-                       ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
-                       ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    gru, ops::GRUKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    gru_grad, ops::GRUGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index 564489d3a9..6d02dff578 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -27,16 +27,16 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
-inline void ReorderInitState(const platform::DeviceContext& ctx,
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
                              const framework::Tensor& src, const size_t* index,
                              framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
   row_shuffle(ctx, src, index, *dst, indexed_src);
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GRUKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
@@ -60,12 +60,12 @@ class GRUKernel : public framework::OpKernel<T> {
     auto hidden_dims = hidden->dims();
 
     bool is_reverse = context.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<Place, T> to_batch;
-    auto& dev_ctx = context.device_context();
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     to_batch(dev_ctx, *input, *batch_gate, true, is_reverse);
 
     if (bias) {
-      math::RowwiseAdd<Place, T> add_bias;
+      math::RowwiseAdd<DeviceContext, T> add_bias;
       add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
     }
 
@@ -80,8 +80,9 @@ class GRUKernel : public framework::OpKernel<T> {
       // Since the batch computing for GRU reorders the input sequences
       // according to their length. The initialized cell state also needs
       // to reorder.
-      ReorderInitState<Place, T>(context.device_context(), *h0, order,
-                                 &ordered_h0, true);
+      ReorderInitState<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), *h0, order,
+          &ordered_h0, true);
       gru_value.prev_out_value = ordered_h0.data<T>();
     } else {
       gru_value.prev_out_value = nullptr;
@@ -99,14 +100,14 @@ class GRUKernel : public framework::OpKernel<T> {
       gru_value.output_value = hidden_t.data<T>();
       gru_value.gate_value = gate_t.data<T>();
       gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-      math::GRUUnitFunctor<Place, T>::compute(
+      math::GRUUnitFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, frame_size, cur_batch_size,
           math::ActiveType(context.Attr<std::string>("activation")),
           math::ActiveType(context.Attr<std::string>("gate_activation")));
       gru_value.prev_out_value = gru_value.output_value;
     }
 
-    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden->set_lod(batch_gate->lod());
     to_seq(dev_ctx, *batch_hidden, *hidden);
   }
@@ -116,7 +117,7 @@ class GRUKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GRUGradKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
@@ -141,14 +142,14 @@ class GRUGradKernel : public framework::OpKernel<T> {
     auto hidden_dims = hidden->dims();
     int frame_size = hidden_dims[1];
 
-    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
     batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
     batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
     batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
                                                  context.GetPlace());
-    math::SetConstant<Place, T> zero;
-    auto& dev_ctx = context.device_context();
+    math::SetConstant<DeviceContext, T> zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     zero(dev_ctx, &batch_hidden_grad, static_cast<T>(0.0));
     zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
     zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
@@ -156,12 +157,13 @@ class GRUGradKernel : public framework::OpKernel<T> {
     Tensor ordered_h0, ordered_h0_grad;
     const size_t* order = batch_gate->lod()[2].data();
     if (h0) {
-      ReorderInitState<Place, T>(context.device_context(), *h0, order,
-                                 &ordered_h0, true);
+      ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
+                                         true);
     }
     if (h0_grad) {
       ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
-      zero(context.device_context(), &ordered_h0_grad, static_cast<T>(0.0));
+      zero(context.template device_context<DeviceContext>(), &ordered_h0_grad,
+           static_cast<T>(0.0));
     }
 
     bool is_reverse = context.Attr<bool>("is_reverse");
@@ -216,25 +218,25 @@ class GRUGradKernel : public framework::OpKernel<T> {
         gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
       }
 
-      math::GRUUnitGradFunctor<Place, T>::compute(
+      math::GRUUnitGradFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size,
           math::ActiveType(context.Attr<std::string>("activation")),
           math::ActiveType(context.Attr<std::string>("gate_activation")));
     }
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      math::Batch2LoDTensorFunctor<Place, T> to_seq;
+      math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
       batch_gate_grad.set_lod(batch_gate->lod());
       to_seq(dev_ctx, batch_gate_grad, *input_grad);
     }
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
-      math::ColwiseSum<Place, T> col_sum;
+      math::ColwiseSum<DeviceContext, T> col_sum;
       col_sum(dev_ctx, batch_gate_grad, bias_grad);
     }
     if (h0 && h0_grad) {
-      ReorderInitState<Place, T>(context.device_context(), ordered_h0_grad,
-                                 order, h0_grad, false);
+      ReorderInitState<DeviceContext, T>(dev_ctx, ordered_h0_grad, order,
+                                         h0_grad, false);
     }
   }
 
diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
index 877c969103..705de87be5 100644
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -201,9 +201,10 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
             ops::GRUUnitGradOp);
-REGISTER_OP_CPU_KERNEL(gru_unit,
-                       ops::GRUUnitKernel<paddle::platform::CPUPlace, float>,
-                       ops::GRUUnitKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>,
-    ops::GRUUnitGradKernel<paddle::platform::CPUPlace, double>);
+    gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    gru_unit_grad,
+    ops::GRUUnitGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUUnitGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu
index 821c8c6421..7c752db494 100644
--- a/paddle/operators/gru_unit_op.cu
+++ b/paddle/operators/gru_unit_op.cu
@@ -16,9 +16,10 @@
 #include "paddle/operators/gru_unit_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(gru_unit,
-                       ops::GRUUnitKernel<paddle::platform::GPUPlace, float>,
-                       ops::GRUUnitKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
-    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>,
-    ops::GRUUnitGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    gru_unit, ops::GRUUnitKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUUnitKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    gru_unit_grad,
+    ops::GRUUnitGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUUnitGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h
index 3398c0934e..8fe60c750d 100644
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
@@ -34,7 +34,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GRUUnitKernel : public framework::OpKernel<T> {
  public:
   template <typename Device, typename X, typename Y>
@@ -71,7 +71,8 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     auto g = EigenMatrix<T>::From(*gate);
     auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
     auto h = EigenMatrix<T>::From(*hidden);
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     // calculate unactivated gate outputs
     if (bias) {
@@ -86,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     const T* weight_data = weight->data<T>();
     T* gate_data = gate->data<T>();
     T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    math::gemm<Place, T>(context.device_context(), false, false, batch_size,
-                         2 * frame_size, frame_size, 1, hidden_prev_data,
-                         frame_size, weight_data, frame_size * 2, 1, gate_data,
-                         frame_size * 3);
+    math::gemm<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), false, false,
+        batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size,
+        weight_data, frame_size * 2, 1, gate_data, frame_size * 3);
 
     // calculate activited gate
     Eigen::array<int, 2> extents({{batch_size, frame_size}});
@@ -102,11 +103,11 @@ class GRUUnitKernel : public framework::OpKernel<T> {
                g.slice(r_offsets, extents), g.slice(r_offsets, extents));
     auto r = g.slice(r_offsets, extents);  // reset gate
     r_h_p.device(place) = r * h_p;         // reset previous hidden state
-    math::gemm<Place, T>(context.device_context(), false, false, batch_size,
-                         frame_size, frame_size, 1, reset_hidden_prev_data,
-                         frame_size, weight_data + frame_size * frame_size * 2,
-                         frame_size, 1, gate_data + frame_size * 2,
-                         frame_size * 3);
+    math::gemm<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), false, false,
+        batch_size, frame_size, frame_size, 1, reset_hidden_prev_data,
+        frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1,
+        gate_data + frame_size * 2, frame_size * 3);
 
     Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
     ActCompute(context.Attr<int>("activation"), place,
@@ -118,7 +119,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class GRUUnitGradKernel : public framework::OpKernel<T> {
  public:
   template <typename Device, typename X, typename Y, typename DX, typename DY>
@@ -166,7 +167,8 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     auto d_h = EigenMatrix<T>::From(*hidden_grad);
     auto d_g = EigenMatrix<T>::From(gate_grad);
     auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     int batch_size = input->dims()[0];
     int frame_size = hidden_prev->dims()[1];
@@ -186,11 +188,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     ActGradCompute(context.Attr<int>("activation"), place, c, c,
                    d_g.slice(c_offsets, extents), d_h * u);
     // backward for reset_hidden_prev
-    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
-                         frame_size, frame_size, 1,
-                         gate_grad_data + frame_size * 2, frame_size * 3,
-                         weight_data + frame_size * frame_size * 2, frame_size,
-                         0, reset_hidden_prev_grad_data, frame_size);
+    math::gemm<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), false, true,
+        batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2,
+        frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size,
+        0, reset_hidden_prev_grad_data, frame_size);
     // backward for unactivated reset gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                    d_g.slice(r_offsets, extents), d_r_h_p * h_p);
@@ -198,17 +200,18 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     if (weight_grad) {
       T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
       // backward for state_weight
-      math::gemm<Place, T>(
-          context.device_context(), true, false, frame_size, frame_size,
-          batch_size, 1, reset_hidden_prev_data, frame_size,
-          gate_grad_data + frame_size * 2, frame_size * 3, 0,
+      math::gemm<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), true, false,
+          frame_size, frame_size, batch_size, 1, reset_hidden_prev_data,
+          frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0,
           weight_grad_data + frame_size * frame_size * 2, frame_size);
 
       // backward for update_gate_weight and reset_gate_weight
-      math::gemm<Place, T>(context.device_context(), true, false, frame_size,
-                           frame_size * 2, batch_size, 1, hidden_prev_data,
-                           frame_size, gate_grad_data, frame_size * 3, 0,
-                           weight_grad_data, frame_size * 2);
+      math::gemm<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), true, false,
+          frame_size, frame_size * 2, batch_size, 1, hidden_prev_data,
+          frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data,
+          frame_size * 2);
     }
     // backward for hidden_prev
     if (hidden_prev_grad) {
@@ -216,10 +219,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
       auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
       d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
-      math::gemm<Place, T>(context.device_context(), false, true, batch_size,
-                           frame_size, frame_size * 2, 1, gate_grad_data,
-                           frame_size * 3, weight_data, frame_size * 2, 1,
-                           hidden_prev_grad_data, frame_size);
+      math::gemm<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), false, true,
+          batch_size, frame_size, frame_size * 2, 1, gate_grad_data,
+          frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data,
+          frame_size);
     }
     // backward for input
     if (input_grad) {
diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc
index 1e13897bb6..373b4d99b4 100644
--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
@@ -106,8 +106,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
             hinge_loss_grad, ops::HingeLossGradOp);
-REGISTER_OP_CPU_KERNEL(hinge_loss,
-                       ops::HingeLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    hinge_loss,
+    ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     hinge_loss_grad,
-    ops::HingeLossGradKernel<paddle::platform::CPUPlace, float>);
+    ops::HingeLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu
index ec20b08e30..31a5bde292 100644
--- a/paddle/operators/hinge_loss_op.cu
+++ b/paddle/operators/hinge_loss_op.cu
@@ -16,8 +16,9 @@
 #include "paddle/operators/hinge_loss_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(hinge_loss,
-                       ops::HingeLossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    hinge_loss,
+    ops::HingeLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     hinge_loss_grad,
-    ops::HingeLossGradKernel<paddle::platform::GPUPlace, float>);
+    ops::HingeLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h
index c0be496f9c..91369cfb8a 100644
--- a/paddle/operators/hinge_loss_op.h
+++ b/paddle/operators/hinge_loss_op.h
@@ -19,14 +19,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class HingeLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* pred = context.Input<framework::Tensor>("Logits");
     auto* label = context.Input<framework::Tensor>("Labels");
     auto* loss = context.Output<framework::Tensor>("Loss");
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto x = framework::EigenVector<T>::Flatten(*pred);
     auto y = framework::EigenVector<T>::Flatten(*label);
@@ -38,7 +39,7 @@ class HingeLossKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class HingeLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -48,7 +49,8 @@ class HingeLossGradKernel : public framework::OpKernel<T> {
         context.Input<framework::Tensor>(framework::GradVarName("Loss"));
     auto* dpred =
         context.Output<framework::Tensor>(framework::GradVarName("Logits"));
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto x = framework::EigenVector<T>::Flatten(*pred);
     auto y = framework::EigenVector<T>::Flatten(*label);
diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc
index 938803d5b3..11828d083a 100644
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
@@ -124,8 +124,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
             huber_loss_grad, ops::HuberLossGradOp);
-REGISTER_OP_CPU_KERNEL(huber_loss,
-                       ops::HuberLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    huber_loss,
+    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CPUPlace, float>);
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu
index 317321dc6c..d49a4d9d42 100644
--- a/paddle/operators/huber_loss_op.cu
+++ b/paddle/operators/huber_loss_op.cu
@@ -16,8 +16,9 @@
 #include "paddle/operators/huber_loss_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(huber_loss,
-                       ops::HuberLossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    huber_loss,
+    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::GPUPlace, float>);
+    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/huber_loss_op.h b/paddle/operators/huber_loss_op.h
index 4e7bc55432..4dd20e8b08 100644
--- a/paddle/operators/huber_loss_op.h
+++ b/paddle/operators/huber_loss_op.h
@@ -41,7 +41,7 @@ struct HuberLossForward {
   T delta;
 };
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class HuberLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -50,7 +50,8 @@ class HuberLossKernel : public framework::OpKernel<T> {
     auto* out0 = context.Output<Tensor>("Residual");
     auto* out1 = context.Output<Tensor>("Out");
     auto delta = static_cast<T>(context.Attr<AttrType>("delta"));
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto x = EigenVector<T>::Flatten(*in0);
     auto y = EigenVector<T>::Flatten(*in1);
@@ -85,7 +86,7 @@ struct HuberLossBackward {
   T delta;
 };
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class HuberLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -94,7 +95,8 @@ class HuberLossGradKernel : public framework::OpKernel<T> {
     auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
     auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
     auto delta = static_cast<T>(context.op().Attr<AttrType>("delta"));
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto residual = EigenVector<T>::Flatten(*in0);
     auto out_grad = EigenVector<T>::Flatten(*in1);
diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc
index 02ebf02296..c0b51202c6 100644
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
@@ -69,7 +69,8 @@ $$Out = \sum{|X|}$$
 namespace ops = paddle::operators;
 REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
             ops::L1NormGradOp);
-REGISTER_OP_CPU_KERNEL(l1_norm,
-                       ops::L1NormKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    l1_norm_grad, ops::L1NormGradKernel<paddle::platform::CPUPlace, float>);
+    l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    l1_norm_grad,
+    ops::L1NormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu
index 1c206e04cc..fd725f86f6 100644
--- a/paddle/operators/l1_norm_op.cu
+++ b/paddle/operators/l1_norm_op.cu
@@ -16,7 +16,8 @@
 #include "paddle/operators/l1_norm_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(l1_norm,
-                       ops::L1NormKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    l1_norm_grad, ops::L1NormGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    l1_norm, ops::L1NormKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    l1_norm_grad,
+    ops::L1NormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h
index 3c60dc3dc7..ae3878f2b7 100644
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 // Out = sum(abs(X))
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class L1NormKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -30,14 +30,15 @@ class L1NormKernel : public framework::OpKernel<T> {
 
     auto x = framework::EigenVector<T>::Flatten(*X);
     auto out = framework::EigenScalar<T>::From(*Out);
-    auto place = context.GetEigenDevice<Place>();
+    auto &place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     out.device(place) = x.abs().sum();
   }
 };
 
 // dX = dout * sign(X)
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class L1NormGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -52,7 +53,8 @@ class L1NormGradKernel : public framework::OpKernel<T> {
     auto x_eigen = framework::EigenVector<T>::Flatten(*x);
     auto d_out_eigen = framework::EigenVector<T>::Flatten(*d_out);
     auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
-    auto place = context.GetEigenDevice<Place>();
+    auto &place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> x_dsize(x->numel());
     dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign();
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 8e079a14e0..896e3657d4 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -261,9 +261,10 @@ REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker,
             linear_chain_crf_grad, ops::LinearChainCRFGradOp);
 REGISTER_OP_CPU_KERNEL(
     linear_chain_crf,
-    ops::LinearChainCRFOpKernel<paddle::platform::CPUPlace, float>,
-    ops::LinearChainCRFOpKernel<paddle::platform::CPUPlace, double>);
+    ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     linear_chain_crf_grad,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUPlace, float>,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUPlace, double>);
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUDeviceContext,
+                                    double>);
diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu
index 6fc8995f4c..3b105ec341 100644
--- a/paddle/operators/linear_chain_crf_op.cu
+++ b/paddle/operators/linear_chain_crf_op.cu
@@ -16,11 +16,12 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     linear_chain_crf,
-    ops::LinearChainCRFOpKernel<paddle::platform::GPUPlace, float>,
-    ops::LinearChainCRFOpKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
+    ops::LinearChainCRFOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LinearChainCRFOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
     linear_chain_crf_grad,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::GPUPlace, float>,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::GPUPlace, double>);
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CUDADeviceContext,
+                                    double>);
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index 014bbfa758..694584e79c 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -50,7 +50,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LinearChainCRFOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -137,7 +137,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
         framework::make_ddim({static_cast<int64_t>(batch_size), 1}),
         platform::CPUPlace());
 
-    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
     auto x = EigenMatrix<T>::From(*emission_weights);
     auto x_row_max = EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
@@ -287,7 +288,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -359,8 +360,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     emission_grad->mutable_data<T>(platform::CPUPlace());
     if (transition_grad) {
       transition_grad->mutable_data<T>(platform::CPUPlace());
-      math::SetConstant<platform::CPUPlace, T>()(ctx.device_context(),
-                                                 transition_grad, 0.);
+      math::set_constant(ctx.device_context(), transition_grad, 0.);
     }
     // Now, all the inputs and outputs should be on the CPU memory.
 
@@ -384,10 +384,10 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
       Tensor one_seq_beta = beta.Slice(start_pos, end_pos);
       Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos);
 
-      BackwardOneSequence(ctx.device_context(), ll_grad[i],
-                          one_seq_emission_exps, *transition_exps,
-                          one_seq_alpha, one_seq_label, &one_seq_beta,
-                          transition_grad, &one_seq_emission_grad);
+      BackwardOneSequence(
+          ctx.template device_context<platform::CPUDeviceContext>(), ll_grad[i],
+          one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
+          &one_seq_beta, transition_grad, &one_seq_emission_grad);
     }
 
     if (platform::is_gpu_place(ctx.GetPlace())) {
@@ -441,8 +441,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     copyTensor(ctx, transition_grad_src, transition_grad_dst);
   }
 
-  void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad,
-                           const Tensor& emission_exps,
+  void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
+                           const T ll_grad, const Tensor& emission_exps,
                            const Tensor& transition_exps, const Tensor& alpha,
                            const Tensor& label, Tensor* beta,
                            Tensor* transition_grad,
@@ -481,7 +481,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     auto alpha_mat = EigenMatrix<T>::From(alpha);
     auto beta_mat = EigenMatrix<T>::From(*beta);
 
-    auto* place = ctx.GetEigenDevice<platform::CPUPlace>();
+    auto* place = ctx.eigen_device();
     auto prob = alpha_mat * beta_mat;
     auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
                        .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu
index 5244a17c3a..f7c2358980 100644
--- a/paddle/operators/lod_reset_op.cu
+++ b/paddle/operators/lod_reset_op.cu
@@ -16,9 +16,10 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(lod_reset,
-                       ops::LoDResetKernel<paddle::platform::GPUPlace, float>,
-                       ops::LoDResetKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
-    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::GPUPlace, float>,
-    ops::LoDResetGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lod_reset, ops::LoDResetKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lod_reset_grad,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h
index cbcbf80adc..b86f8b1313 100644
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/operators/lod_reset_op.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LoDResetKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -65,7 +65,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LoDResetGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
index 257e5c8a49..4524229a33 100644
--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
@@ -109,7 +109,8 @@ class LogLossGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
             ops::LogLossGradOp);
-REGISTER_OP_CPU_KERNEL(log_loss,
-                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
+    log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad,
+    ops::LogLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
index 6c189ef341..e87ac7d12a 100644
--- a/paddle/operators/log_loss_op.cu
+++ b/paddle/operators/log_loss_op.cu
@@ -16,7 +16,8 @@
 #include "paddle/operators/log_loss_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(log_loss,
-                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    log_loss_grad,
+    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
index 73404fce91..743eddb740 100644
--- a/paddle/operators/log_loss_op.h
+++ b/paddle/operators/log_loss_op.h
@@ -24,7 +24,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class LogLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -38,7 +38,7 @@ class LogLossKernel : public framework::OpKernel<T> {
     auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
 
     auto loss = EigenVector<T>::Flatten(*loss_out);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     loss.device(place) = (-(label * (prediction + epsilon).log()) -
                           ((static_cast<T>(1) - label) *
@@ -46,7 +46,7 @@ class LogLossKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class LogLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -59,7 +59,7 @@ class LogLossGradKernel : public framework::OpKernel<T> {
     auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
 
     auto dl = EigenVector<T>::Flatten(*dloss);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     if (dpred) {
       dpred->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc
index a37582c1d8..c818d5e9c1 100644
--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
@@ -139,15 +139,16 @@ class LogicalOp : public framework::OperatorWithKernel {
       ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \
       ::paddle::framework::EmptyGradOpMaker);
 
-REGISTER_BINARY_LOGICAL_OP(logical_and, "Out = X && Y");
+REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
 REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
                                paddle::operators::LogicalAndFunctor);
-REGISTER_BINARY_LOGICAL_OP(logical_or, "Out = X && Y");
+REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
 REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
                                paddle::operators::LogicalOrFunctor);
-REGISTER_UNARY_LOGICAL_OP(logical_not, "Out = !X");
+REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
 REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
                               paddle::operators::LogicalNotFunctor);
-REGISTER_BINARY_LOGICAL_OP(logical_xor, "Out = (X || Y) && !(X && Y)");
+REGISTER_BINARY_LOGICAL_OP(logical_xor,
+                           "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$");
 REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
                                paddle::operators::LogicalXorFunctor);
diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu
index d41239b2ca..7fef60e0c9 100644
--- a/paddle/operators/logical_op.cu
+++ b/paddle/operators/logical_op.cu
@@ -14,11 +14,11 @@
 
 #include "paddle/operators/logical_op.h"
 
-REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU,
+REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA,
                                paddle::operators::LogicalAndFunctor);
-REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU,
+REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA,
                                paddle::operators::LogicalOrFunctor);
-REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU,
+REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA,
                               paddle::operators::LogicalNotFunctor);
-REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU,
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA,
                                paddle::operators::LogicalXorFunctor);
diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h
index 6e78a7d6ed..629388cac8 100644
--- a/paddle/operators/logical_op.h
+++ b/paddle/operators/logical_op.h
@@ -47,7 +47,7 @@ struct LogicalXorFunctor {
   }
 };
 
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class BinaryLogicalOpKernel
     : public framework::OpKernel<typename Functor::ELEM_TYPE> {
  public:
@@ -57,14 +57,14 @@ class BinaryLogicalOpKernel
     auto* y = context.Input<framework::Tensor>("Y");
     auto* out = context.Output<framework::Tensor>("Out");
     Functor binary_func;
-    platform::Transform<Place> trans;
-    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
-          y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
-          binary_func);
+    platform::Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x->data<T>(),
+          x->data<T>() + x->numel(), y->data<T>(),
+          out->mutable_data<bool>(context.GetPlace()), binary_func);
   }
 };
 
-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
 class UnaryLogicalOpKernel
     : public framework::OpKernel<typename Functor::ELEM_TYPE> {
  public:
@@ -73,8 +73,9 @@ class UnaryLogicalOpKernel
     auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     Functor unary_func;
-    platform::Transform<Place> trans;
-    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+    platform::Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x->data<T>(),
+          x->data<T>() + x->numel(),
           out->mutable_data<bool>(context.GetPlace()), unary_func);
   }
 };
@@ -85,9 +86,9 @@ class UnaryLogicalOpKernel
 #define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
   REGISTER_OP_##dev##_KERNEL(                                 \
       op_type, ::paddle::operators::BinaryLogicalOpKernel<    \
-                   ::paddle::platform::dev##Place, functor<bool>>);
+                   ::paddle::platform::dev##DeviceContext, functor<bool>>);
 
 #define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
   REGISTER_OP_##dev##_KERNEL(                                \
       op_type, ::paddle::operators::UnaryLogicalOpKernel<    \
-                   ::paddle::platform::dev##Place, functor<bool>>);
+                   ::paddle::platform::dev##DeviceContext, functor<bool>>);
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 84b044184a..9431030a53 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -85,6 +85,8 @@ template <typename T>
 class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
+    auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
     bool is_sparse = context.Attr<bool>("is_sparse");
     if (is_sparse) {
       auto* ids = context.Input<LoDTensor>("Ids");
@@ -95,7 +97,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       auto* ids_data = ids->data<int64_t>();
       auto ids_dim = ids->dims();
 
-      auto stream = context.cuda_device_context().stream();
+      auto stream = dev_ctx.stream();
       // copy GPU memory to CPU pinned memory
       framework::Vector<int64_t> new_rows;
       new_rows.resize(ids_dim[0]);
@@ -129,14 +131,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
 
       auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-      t.device(context.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
 
       dim3 threads(128, 8);
       dim3 grids(8, 1);
-      LookupTableGrad<
-          T, 128, 8,
-          8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+      LookupTableGrad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
           d_table, d_output, ids, N, K, D);
     }
   }
@@ -146,7 +145,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>,
-                       ops::LookupTableCUDAKernel<double>);
-REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel<float>,
-                       ops::LookupTableGradCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>,
+                        ops::LookupTableCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(lookup_table_grad,
+                        ops::LookupTableGradCUDAKernel<float>,
+                        ops::LookupTableGradCUDAKernel<double>);
diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc
index 00392b7967..b5b7bc940a 100644
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
@@ -19,6 +19,103 @@ namespace operators {
 
 using framework::Tensor;
 
+template <typename T>
+struct LRNFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    auto x_v = framework::EigenVector<T>::Flatten(input);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+    e_mid = e_mid.constant(k);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(input);
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch >= 0 && ch < C) {
+            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                               Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            s += alpha * r.square();
+          }
+        }
+      }
+    }
+
+    auto out_e = framework::EigenVector<T>::Flatten(*out);
+    out_e = x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+  }
+};
+template struct LRNFunctor<platform::CPUDeviceContext, float>;
+template struct LRNFunctor<platform::CPUDeviceContext, double>;
+
+template <typename T>
+struct LRNGradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    T ratio = -2 * alpha * beta;
+    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
+    x_g_e = x_g_e.constant(0.0);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(x);
+    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
+    auto e_out = framework::EigenTensor<T, 4>::From(out);
+    auto e_out_g = framework::EigenTensor<T, 4>::From(out_g);
+    auto e_mid = framework::EigenTensor<T, 4>::From(mid);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                             Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                     Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        i_x_g = i_mid.pow(-beta) * i_out_g;
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch < 0 || ch >= C) {
+            continue;
+          }
+
+          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                       Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          i_x_g += ratio * c_out_g * c_out * i_x / c_mid;
+        }
+      }
+    }
+  }
+};
+template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
+template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
+
 class LRNOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -83,8 +180,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Local Response Normalization Operator.
 
-This operator comes from the paper
-"ImageNet Classification with Deep Convolutional Neural Networks".
+This operator comes from the paper:
+<<ImageNet Classification with Deep Convolutional Neural Networks>>.
 
 The original formula is:
 
@@ -107,7 +204,7 @@ Input(i, x, y), Output(i, x, y) represents an element in an image.
 C is the number of feature maps of one image. n is a hyper-parameter
 configured when operator is initialized. The sum in the denominator
 is the sum of the same positions in the neighboring maps.
-    
+
 )DOC");
   }
 };
@@ -119,8 +216,7 @@ class LRNOpGrad : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("MidOut")),
-                   "Input(MidOut@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
 
@@ -134,6 +230,7 @@ class LRNOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
-REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(lrn_grad,
-                       ops::LRNGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    lrn_grad, ops::LRNGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu
index 607dc6d86a..c6857c2b6d 100644
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
@@ -12,11 +12,167 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/lrn_op.h"
 
-namespace ops = paddle::operators;
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C,
+                                   int H, int W, int size, T k, T alpha) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+
+    in += offset;
+    mid += offset;
+    const int step = H * W;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    T accum = 0;
+    int index = 0;
+    while (index < C + post_pad) {
+      if (index < C) {
+        T val = in[index * step];
+        accum += val * val;
+      }
+      if (index >= size) {
+        T val = in[(index - size) * step];
+        accum -= val * val;
+      }
+      if (index >= post_pad) {
+        mid[(index - post_pad) * step] = k + accum * alpha;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid,
+                                T negative_beta, T* out) {
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < input_size) {
+    out[index] = in[index] * pow(mid[index], negative_beta);
+  }
+}
+
+template <typename T>
+void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs,
+                    T* outputs, T* mid, int N, int C, int H, int W, int n, T k,
+                    T alpha, T beta) {
+  int img_size = N * H * W;
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  KeCMRNormFillScale<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      img_size, inputs, mid, C, H, W, n, k, alpha);
+
+  int input_size = N * H * W * C;
+  grid_size = (input_size + block_size - 1) / block_size;
+  KeCMRNormOutput<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      input_size, inputs, mid, -beta, outputs);
+}
+
+template <typename T>
+struct LRNFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    CrossMapNormal<T>(
+        ctx, input.data<T>(), out->mutable_data<T>(ctx.GetPlace()),
+        mid->mutable_data<T>(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta);
+  }
+};
+
+template struct LRNFunctor<platform::CUDADeviceContext, float>;
+template struct LRNFunctor<platform::CUDADeviceContext, double>;
 
-REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(lrn_grad,
-                       ops::LRNGradKernel<paddle::platform::GPUPlace, float>);
+template <typename T>
+__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out,
+                              const T* mid, T* x_g, const T* out_g, int C,
+                              int H, int W, int size, T negative_beta,
+                              T ratio) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+    x += offset;
+    out += offset;
+    mid += offset;
+    out_g += offset;
+    x_g += offset;
+
+    const int step = H * W;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    int index = 0;
+    T accum = 0;
+    // TODO(gongwb): optimize this with thread shared array.
+    while (index < C + post_pad) {
+      if (index < C) {
+        x_g[index * step] = 0.0;
+        accum += out_g[index * step] * out[index * step] / mid[index * step];
+      }
+      if (index >= size) {
+        accum -= out_g[(index - size) * step] * out[(index - size) * step] /
+                 mid[(index - size) * step];
+      }
+      if (index >= post_pad) {
+        x_g[(index - post_pad) * step] +=
+            out_g[(index - post_pad) * step] *
+                pow(mid[(index - post_pad) * step], negative_beta) -
+            ratio * x[(index - post_pad) * step] * accum;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x,
+                        const T* out, const T* mid, T* x_g, const T* out_g,
+                        int N, int C, int H, int W, int n, T alpha, T beta) {
+  int img_size = N * H * W;
+
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  KeCMRNormDiff<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta,
+      2.0f * alpha * beta);
+}
+
+template <typename T>
+struct LRNGradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    CrossMapNormalGrad<T>(ctx, x.data<T>(), out.data<T>(), mid.data<T>(),
+                          x_g->mutable_data<T>(ctx.GetPlace()), out_g.data<T>(),
+                          N, C, H, W, n, alpha, beta);
+  }
+};
+
+template struct LRNGradFunctor<platform::CUDADeviceContext, float>;
+template struct LRNGradFunctor<platform::CUDADeviceContext, double>;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lrn, ops::LRNKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    lrn_grad, ops::LRNGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h
index 606c657443..44063d3e03 100644
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
@@ -21,7 +21,15 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename place, typename T>
+struct LRNFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta);
+};
+
+template <typename DeviceContext, typename T>
 class LRNKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
@@ -31,8 +39,8 @@ class LRNKernel : public framework::OpKernel<T> {
   // f(x) represents outputs
   void Compute(const framework::ExecutionContext& ctx) const override {
     // input
-    const Tensor* x = ctx.Input<Tensor>("X");
-    auto x_dims = x->dims();
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    auto x_dims = x.dims();
 
     // NCHW
     int N = x_dims[0];
@@ -57,38 +65,20 @@ class LRNKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0");
     PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0");
 
-    auto x_v = framework::EigenVector<T>::Flatten(*x);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-
-    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-    e_mid.device(ctx.GetEigenDevice<Place>()) = e_mid.constant(k);
-
-    auto e_x = framework::EigenTensor<T, 4>::From(*x);
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        for (int c = start; c <= end; c++) {
-          int ch = i + c;
-          if (ch >= 0 && ch < C) {
-            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                               Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            s.device(ctx.GetEigenDevice<Place>()) += alpha * r.square();
-          }
-        }
-      }
-    }
-
-    auto out_e = framework::EigenVector<T>::Flatten(*out);
-    out_e.device(ctx.GetEigenDevice<Place>()) =
-        x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+    LRNFunctor<DeviceContext, T> f;
+    f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta);
   }
 };
 
+template <typename DeviceContext, typename T>
+struct LRNGradFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta);
+};
+
 /**
  * \brief Backward calculation for normalization with across maps.
  *
@@ -97,7 +87,7 @@ class LRNKernel : public framework::OpKernel<T> {
  * The implementation of this Function is derived from the
  * CrossMapNormalFunc implementation.
  *
- * InputGrad = OutputGrad * denoms ^ (-beta)
+ * InputGrad = OutputGrad * MidOut ^ (-beta)
  *    -- upper
  *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue
  *    -- lower
@@ -108,23 +98,20 @@ class LRNKernel : public framework::OpKernel<T> {
  * The upper and lower is the same as forward. The logic of the sum
  * is also the same as forward.
  */
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LRNGradKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* out = ctx.Input<Tensor>("Out");
-    const Tensor* out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    const Tensor* mid = ctx.Input<Tensor>("MidOut");
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    const Tensor& out = *ctx.Input<Tensor>("Out");
+    const Tensor& out_g = *ctx.Input<Tensor>(framework::GradVarName("Out"));
+    const Tensor& mid = *ctx.Input<Tensor>("MidOut");
 
     auto x_g = ctx.Output<Tensor>(framework::GradVarName("X"));
     x_g->mutable_data<T>(ctx.GetPlace());
 
-    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
-    x_g_e.device(ctx.GetEigenDevice<Place>()) = x_g_e.constant(0.0);
-
-    auto x_dims = x->dims();
+    auto x_dims = x.dims();
     int N = x_dims[0];
     int C = x_dims[1];
     int H = x_dims[2];
@@ -133,51 +120,9 @@ class LRNGradKernel : public framework::OpKernel<T> {
     int n = ctx.Attr<int>("n");
     T alpha = ctx.Attr<T>("alpha");
     T beta = ctx.Attr<T>("beta");
-    T ratio = -2 * alpha * beta;
-
-    auto e_x = framework::EigenTensor<T, 4>::From(*x);
-    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
-    auto e_out = framework::EigenTensor<T, 4>::From(*out);
-    auto e_out_g = framework::EigenTensor<T, 4>::From(*out_g);
-    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                             Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                     Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        i_x_g.device(ctx.GetEigenDevice<Place>()) = i_mid.pow(-beta) * i_out_g;
-        for (int c = start; c <= end; c++) {
-          int ch = i + c;
-          if (ch < 0 || ch >= C) {
-            continue;
-          }
-
-          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                       Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          i_x_g.device(ctx.GetEigenDevice<Place>()) +=
-              ratio * c_out_g * c_out * i_x / c_mid;
-        }
-      }
-    }
+
+    LRNGradFunctor<DeviceContext, T> f;
+    f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
   }
 };
 
diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc
index fa8e5f2da8..2db7da30db 100644
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -273,8 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
-REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::CPUPlace, float>,
-                       ops::LSTMKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(lstm_grad,
-                       ops::LSTMGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::LSTMGradKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    lstm_grad, ops::LSTMGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc
index 610cbb03e8..48519bed6f 100644
--- a/paddle/operators/lstm_op.cu.cc
+++ b/paddle/operators/lstm_op.cu.cc
@@ -15,8 +15,9 @@
 #include "paddle/operators/lstm_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::GPUPlace, float>,
-                       ops::LSTMKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(lstm_grad,
-                       ops::LSTMGradKernel<paddle::platform::GPUPlace, float>,
-                       ops::LSTMGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lstm, ops::LSTMKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lstm_grad, ops::LSTMGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h
index a78f548aaf..14abd4bf0a 100644
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
@@ -24,16 +24,16 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
-inline void ReorderInitState(const platform::DeviceContext& ctx,
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
                              const framework::Tensor& src, const size_t* index,
                              framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
   row_shuffle(ctx, src, index, *dst, indexed_src);
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LSTMKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -52,8 +52,8 @@ class LSTMKernel : public framework::OpKernel<T> {
     cell_out->mutable_data<T>(ctx.GetPlace());
 
     bool is_reverse = ctx.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<Place, T> to_batch;
-    auto& device_ctx = ctx.device_context();
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
     to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
 
     auto in_dims = input->dims();
@@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel<T> {
       Tensor b = *bias;
       b.Resize({bias->numel(), 1});
       Tensor gate_bias = b.Slice(0, 4 * frame_size);
-      math::RowwiseAdd<Place, T> add_bias;
+      math::RowwiseAdd<DeviceContext, T> add_bias;
       add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
     }
 
@@ -88,8 +88,8 @@ class LSTMKernel : public framework::OpKernel<T> {
       // Since the batch computing for LSTM reorders the input sequence
       // according to their length. The initialized cell state also needs
       // to reorder.
-      ReorderInitState<Place, T>(device_ctx, *cell_t0, order, &ordered_c0,
-                                 true);
+      ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
+                                         &ordered_c0, true);
       lstm_value.prev_state_value = ordered_c0.data<T>();
     }
 
@@ -121,9 +121,9 @@ class LSTMKernel : public framework::OpKernel<T> {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
-        math::matmul<Place, T>(device_ctx, pre_hidden_t, false, *weight, false,
-                               static_cast<T>(1.0), &gate_t,
-                               static_cast<T>(1.0));
+        math::matmul<DeviceContext, T>(device_ctx, pre_hidden_t, false, *weight,
+                                       false, static_cast<T>(1.0), &gate_t,
+                                       static_cast<T>(1.0));
       } else if (hidden_t0) {
         // If n == 0 and there is no initialized hidden state, that is to say
         // the H0 is zeros, the calculation W_h * H0 will be skiped.
@@ -133,24 +133,24 @@ class LSTMKernel : public framework::OpKernel<T> {
         // according to their length. The initialized hidden state also needs
         // to reorder.
         Tensor ordered_h0;
-        ReorderInitState<Place, T>(device_ctx, *hidden_t0, order, &ordered_h0,
-                                   true);
-        math::matmul<Place, T>(device_ctx, ordered_h0, false, *weight, false,
-                               static_cast<T>(1.0), &gate_t,
-                               static_cast<T>(1.0));
+        ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false, *weight,
+                                       false, static_cast<T>(1.0), &gate_t,
+                                       static_cast<T>(1.0));
       }
 
       lstm_value.gate_value = gate_t.data<T>();
       lstm_value.output_value = out_t.data<T>();
       lstm_value.state_value = cell_t.data<T>();
       lstm_value.state_active_value = cell_pre_act_t.data<T>();
-      math::LstmUnitFunctor<Place, T>::compute(device_ctx, lstm_value,
-                                               frame_size, cur_batch_size,
-                                               gate_act, cell_act, cand_act);
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
+          cell_act, cand_act);
       lstm_value.prev_state_value = lstm_value.state_value;
     }
 
-    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden.set_lod(batch_gate->lod());
     // restore the output hidden in LoDTensor from the batch hidden
     to_seq(device_ctx, batch_hidden, *hidden_out);
@@ -161,7 +161,7 @@ class LSTMKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LSTMGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -187,8 +187,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
     auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
 
-    auto& device_ctx = ctx.device_context();
-    math::SetConstant<Place, T> zero;
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
     if (weight_g) {
       weight_g->mutable_data<T>(ctx.GetPlace());
       zero(device_ctx, weight_g, static_cast<T>(0.0));
@@ -200,7 +200,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
     const size_t* order = batch_gate->lod()[2].data();
     if (c0) {
-      ReorderInitState<Place, T>(device_ctx, *c0, order, &ordered_c0, true);
+      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
+                                         true);
     }
     if (c0 && c0_g) {
       ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
@@ -240,10 +241,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
       lstm_grad.check_og_grad = nullptr;
     }
 
-    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
 
     auto ToBatch = [&batch_gate, &to_batch](
-        const platform::DeviceContext& ctx, const framework::LoDTensor& src,
+        const DeviceContext& ctx, const framework::LoDTensor& src,
         const framework::DDim& dims, framework::LoDTensor& dst) {
       dst.mutable_data<T>(dims, ctx.GetPlace());
       dst.set_lod(batch_gate->lod());
@@ -299,7 +300,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
       }
 
       int cur_batch_size = bend - bstart;
-      math::LstmUnitGradFunctor<Place, T>::compute(
+      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
           device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
           gate_act, cell_act, cand_act);
 
@@ -307,33 +308,34 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
-        math::matmul<Place, T>(device_ctx, gate_g, false, *weight, true,
-                               static_cast<T>(1.0), &pre_hidden_g,
-                               static_cast<T>(1.0));
+        math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
+                                       static_cast<T>(1.0), &pre_hidden_g,
+                                       static_cast<T>(1.0));
         if (weight_g) {
           /* backward weight */
           auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
-          math::matmul<Place, T>(device_ctx, pre_hidden, true, gate_g, false,
-                                 static_cast<T>(1.0), weight_g,
-                                 static_cast<T>(1.0));
+          math::matmul<DeviceContext, T>(device_ctx, pre_hidden, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
         }
       } else {
         if (h0 && weight_g) {
-          ReorderInitState<Place, T>(device_ctx, *h0, order, &ordered_h0, true);
-          math::matmul<Place, T>(device_ctx, ordered_h0, true, gate_g, false,
-                                 static_cast<T>(1.0), weight_g,
-                                 static_cast<T>(1.0));
+          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
+                                             &ordered_h0, true);
+          math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
         }
         if (h0 && h0_g) {
           ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-          math::matmul<Place, T>(device_ctx, gate_g, false, *weight, true,
-                                 static_cast<T>(1.0), &ordered_h0_g,
-                                 static_cast<T>(0.0));
+          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
+                                         true, static_cast<T>(1.0),
+                                         &ordered_h0_g, static_cast<T>(0.0));
         }
       }
     }
 
-    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     if (in_g) {
       /* backward data */
       in_g->mutable_data<T>(ctx.GetPlace());
@@ -344,15 +346,17 @@ class LSTMGradKernel : public framework::OpKernel<T> {
       Tensor b_g = *bias_g;
       b_g.Resize({bias_g->numel(), 1});
       Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
-      math::ColwiseSum<Place, T> col_sum;
+      math::ColwiseSum<DeviceContext, T> col_sum;
       col_sum(device_ctx, batch_gate_g, &gate_bias_g);
     }
 
     if (h0 && h0_g) {
-      ReorderInitState<Place, T>(device_ctx, ordered_h0_g, order, h0_g, false);
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
+                                         false);
     }
     if (c0 && c0_g) {
-      ReorderInitState<Place, T>(device_ctx, ordered_c0_g, order, c0_g, false);
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
+                                         false);
     }
   }
 };
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
index e192283aa0..291f2c295e 100644
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -173,7 +173,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
-                       ops::LstmUnitOpCUDAKernel<double>);
-REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
-                       ops::LstmUnitGradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
+                        ops::LstmUnitOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
+                        ops::LstmUnitGradOpCUDAKernel<double>);
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
index 38cb298f92..61705675d9 100644
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
@@ -35,7 +35,7 @@ inline T tanh(T x) {
   return 2. * sigmoid(2. * x) - 1.;
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LstmUnitKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -78,7 +78,7 @@ class LstmUnitKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LstmUnitGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc
index d7e8a0ea76..42e8961c0e 100644
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -117,7 +117,7 @@ REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
             ops::MarginRankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     margin_rank_loss,
-    ops::MarginRankLossKernel<paddle::platform::CPUPlace, float>);
+    ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     margin_rank_loss_grad,
-    ops::MarginRankLossGradKernel<paddle::platform::CPUPlace, float>);
+    ops::MarginRankLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu
index 3a639f25d4..1c2afccc5b 100644
--- a/paddle/operators/margin_rank_loss_op.cu
+++ b/paddle/operators/margin_rank_loss_op.cu
@@ -16,9 +16,9 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     margin_rank_loss,
-    ops::MarginRankLossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+    ops::MarginRankLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     margin_rank_loss_grad,
-    ops::MarginRankLossGradKernel<paddle::platform::GPUPlace, float>);
+    ops::MarginRankLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h
index 8d0830147e..9c1f96cac1 100644
--- a/paddle/operators/margin_rank_loss_op.h
+++ b/paddle/operators/margin_rank_loss_op.h
@@ -34,7 +34,7 @@ struct Heaviside {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MarginRankLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -56,13 +56,13 @@ class MarginRankLossKernel : public framework::OpKernel<T> {
     auto x1 = framework::EigenVector<T>::Flatten(*x1_t);
     auto x2 = framework::EigenVector<T>::Flatten(*x2_t);
 
-    auto& dev = ctx.GetEigenDevice<Place>();
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
     out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU<T>());
     act.device(dev) = out.unaryExpr(Heaviside<T>());
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MarginRankLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -78,7 +78,7 @@ class MarginRankLossGradKernel : public framework::OpKernel<T> {
     auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
     auto act = framework::EigenVector<T>::Flatten(*act_t);
     auto label = framework::EigenVector<T>::Flatten(*label_t);
-    auto& dev = ctx.GetEigenDevice<Place>();
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
 
     // compute d_x1
     if (d_x1_t) {
diff --git a/paddle/operators/math/context_project.cc b/paddle/operators/math/context_project.cc
index f82ea5d7be..980dd90df8 100644
--- a/paddle/operators/math/context_project.cc
+++ b/paddle/operators/math/context_project.cc
@@ -18,8 +18,8 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template class ContextProjectFunctor<platform::CPUPlace, float>;
-template class ContextProjectFunctor<platform::CPUPlace, double>;
+template class ContextProjectFunctor<platform::CPUDeviceContext, float>;
+template class ContextProjectFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/context_project.cu b/paddle/operators/math/context_project.cu
index 04eeed543c..934e3df645 100644
--- a/paddle/operators/math/context_project.cu
+++ b/paddle/operators/math/context_project.cu
@@ -20,8 +20,8 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template class ContextProjectFunctor<platform::GPUPlace, float>;
-template class ContextProjectFunctor<platform::GPUPlace, double>;
+template class ContextProjectFunctor<platform::CUDADeviceContext, float>;
+template class ContextProjectFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h
index d853507188..4036614086 100644
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
@@ -81,17 +81,17 @@ using LoDTensor = framework::LoDTensor;
  *
  */
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ContextProjectFunctor {
  public:
-  void operator()(const platform::DeviceContext& context, const LoDTensor& in,
+  void operator()(const DeviceContext& context, const LoDTensor& in,
                   const Tensor& padding_data, bool padding_trainable,
                   const int context_start, const int context_length,
                   const int context_stride, const int up_pad,
                   const int down_pad, Tensor* col) {
     auto lod_level_0 = in.lod()[0];
 
-    math::Im2ColFunctor<math::ColFormat::kOCF, Place, float> im2col_ocf;
+    math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, float> im2col_ocf;
 
     std::vector<int> dilation({1, 1});
     std::vector<int> padding({up_pad, 0, down_pad, 0});
@@ -188,17 +188,17 @@ class ContextProjectFunctor {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ContextProjectGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context, const LoDTensor& in,
+  void operator()(const DeviceContext& context, const LoDTensor& in,
                   bool padding_trainable, const int context_start,
                   const int context_length, const int context_stride,
                   const int up_pad, const int down_pad, bool pad_grad,
                   bool input_grad, Tensor* padding_data, Tensor* col) {
     auto lod_level_0 = in.lod()[0];
 
-    math::Col2ImFunctor<math::ColFormat::kOCF, Place, float> col2im_ocf;
+    math::Col2ImFunctor<math::ColFormat::kOCF, DeviceContext, float> col2im_ocf;
 
     std::vector<int> dilation({1, 1});
     std::vector<int> padding({up_pad, 0, down_pad, 0});
@@ -258,8 +258,8 @@ class ContextProjectGradFunctor {
               Tensor out_t_sub = out_t.Slice(k * context_length,
                                              k * context_length + padding_size);
               Tensor w_sub = padding_data->Slice(k, k + padding_size);
-              axpy<Place, T>(context, w_sub.numel(), static_cast<T>(1),
-                             out_t_sub.data<T>(), w_sub.data<T>());
+              axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1),
+                                     out_t_sub.data<T>(), w_sub.data<T>());
             }
           }
           if (down_pad > 0) {
@@ -290,8 +290,8 @@ class ContextProjectGradFunctor {
                   (down_pad_begin_row + t) * context_length);
               Tensor w_sub = padding_data->Slice(
                   up_pad + padding_idx, up_pad + padding_idx + padding_size);
-              axpy<Place, T>(context, w_sub.numel(), static_cast<T>(1),
-                             out_t_sub.data<T>(), w_sub.data<T>());
+              axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1),
+                                     out_t_sub.data<T>(), w_sub.data<T>());
             }
           }
           out_t.Resize({sequence_height, context_length * sequence_width});
diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc
index cf238a58e0..6011a196d4 100644
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
@@ -24,9 +24,9 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
-class CrossEntropyFunctor<platform::CPUPlace, T> {
+class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
+  void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out,
                   const framework::Tensor* prob,
                   const framework::Tensor* labels, const bool softLabel) {
     const int batch_size = prob->dims()[0];
@@ -35,7 +35,7 @@ class CrossEntropyFunctor<platform::CPUPlace, T> {
       auto lbl = EigenMatrix<T>::From(*labels);
       auto loss = EigenMatrix<T>::From(*out);
 
-      loss.device(*ctx.GetEigenDevice<platform::CPUPlace>()) =
+      loss.device(*ctx.eigen_device()) =
           -((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
                 .sum(Eigen::DSizes<int, 1>(1))
                 .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
@@ -53,8 +53,8 @@ class CrossEntropyFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class CrossEntropyFunctor<platform::CPUPlace, float>;
-template class CrossEntropyFunctor<platform::CPUPlace, double>;
+template class CrossEntropyFunctor<platform::CPUDeviceContext, float>;
+template class CrossEntropyFunctor<platform::CPUDeviceContext, double>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu
index 651c08f740..2132d49c93 100644
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
@@ -95,10 +95,10 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
 using Tensor = framework::Tensor;
 
 template <typename T>
-class CrossEntropyFunctor<platform::GPUPlace, T> {
+class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
-                  const framework::Tensor* prob,
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  framework::Tensor* out, const framework::Tensor* prob,
                   const framework::Tensor* labels, bool softLabel) {
     const T* prob_data = prob->data<T>();
     T* loss_data = out->mutable_data<T>(ctx.GetPlace());
@@ -118,16 +118,14 @@ class CrossEntropyFunctor<platform::GPUPlace, T> {
       const int64_t* label_data = labels->data<int64_t>();
       int block = 512;
       int grid = (batch_size + block - 1) / block;
-      CrossEntropyKernel<T><<<
-          grid, block, 0,
-          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>(
           loss_data, prob_data, label_data, batch_size, class_num);
     }
   }
 };
 
-template class CrossEntropyFunctor<platform::GPUPlace, float>;
-template class CrossEntropyFunctor<platform::GPUPlace, double>;
+template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
+template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h
index 70ed9ddd55..677adb5ada 100644
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
@@ -33,11 +33,11 @@ struct TolerableValue {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CrossEntropyFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor* out, const framework::Tensor* prob,
+  void operator()(const DeviceContext& context, framework::Tensor* out,
+                  const framework::Tensor* prob,
                   const framework::Tensor* labels, const bool softLabel);
 };
 }  // namespace math
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
index ae4e47b014..d570c68cd4 100644
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
@@ -19,14 +19,14 @@ namespace operators {
 namespace math {
 
 template <typename T>
-struct GRUUnitFunctor<platform::CPUPlace, T> {
-  static void compute(const platform::DeviceContext &context,
+struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext &context,
                       hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate) {
 #ifndef __NVCC__
     if (value.prev_out_value) {
-      math::gemm<platform::CPUPlace, T>(
+      math::gemm<platform::CPUDeviceContext, T>(
           context, false, false, batch_size, frame_size * 2, frame_size, 1,
           value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
           1, value.gate_value, frame_size * 3);
@@ -36,7 +36,7 @@ struct GRUUnitFunctor<platform::CPUPlace, T> {
                                  frame_size, batch_size, active_gate);
 
     if (value.prev_out_value) {
-      math::gemm<platform::CPUPlace, T>(
+      math::gemm<platform::CPUDeviceContext, T>(
           context, false, false, batch_size, frame_size, frame_size, 1,
           value.reset_output_value, frame_size, value.state_weight, frame_size,
           1, value.gate_value + frame_size * 2, frame_size * 3);
@@ -49,8 +49,8 @@ struct GRUUnitFunctor<platform::CPUPlace, T> {
 };
 
 template <typename T>
-struct GRUUnitGradFunctor<platform::CPUPlace, T> {
-  static void compute(const platform::DeviceContext &context,
+struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext &context,
                       hl_gru_value<T> value, hl_gru_grad<T> grad,
                       int frame_size, int batch_size,
                       activation_mode_t active_node,
@@ -60,13 +60,13 @@ struct GRUUnitGradFunctor<platform::CPUPlace, T> {
                                 grad, frame_size, batch_size, active_node);
 
     if (value.prev_out_value && grad.prev_out_grad) {
-      math::gemm<platform::CPUPlace, T>(
+      math::gemm<platform::CPUDeviceContext, T>(
           context, false, true, batch_size, frame_size, frame_size, 1,
           grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
           frame_size, 0, grad.reset_output_grad, frame_size);
 
       if (grad.state_weight_grad) {
-        math::gemm<platform::CPUPlace, T>(
+        math::gemm<platform::CPUDeviceContext, T>(
             context, true, false, frame_size, frame_size, batch_size, 1,
             value.reset_output_value, frame_size,
             grad.gate_grad + frame_size * 2, frame_size * 3, 1,
@@ -78,13 +78,13 @@ struct GRUUnitGradFunctor<platform::CPUPlace, T> {
                                 grad, frame_size, batch_size, active_gate);
 
     if (grad.prev_out_grad && value.prev_out_value) {
-      math::gemm<platform::CPUPlace, T>(
+      math::gemm<platform::CPUDeviceContext, T>(
           context, false, true, batch_size, frame_size, frame_size * 2, 1,
           grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
           grad.prev_out_grad, frame_size);
 
       if (grad.gate_weight_grad) {
-        math::gemm<platform::CPUPlace, T>(
+        math::gemm<platform::CPUDeviceContext, T>(
             context, true, false, frame_size, frame_size * 2, batch_size, 1,
             value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
             grad.gate_weight_grad, frame_size * 2);
@@ -94,10 +94,10 @@ struct GRUUnitGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template struct GRUUnitFunctor<platform::CPUPlace, float>;
-template struct GRUUnitFunctor<platform::CPUPlace, double>;
-template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
-template struct GRUUnitGradFunctor<platform::CPUPlace, double>;
+template struct GRUUnitFunctor<platform::CPUDeviceContext, float>;
+template struct GRUUnitFunctor<platform::CPUDeviceContext, double>;
+template struct GRUUnitGradFunctor<platform::CPUDeviceContext, float>;
+template struct GRUUnitGradFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
index 0252bdbdb6..dd518cd1e4 100644
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
@@ -19,13 +19,12 @@ namespace operators {
 namespace math {
 
 template <typename T>
-struct GRUUnitFunctor<platform::GPUPlace, T> {
-  static void compute(const platform::DeviceContext &context,
+struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext &context,
                       hl_gru_value<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate) {
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    auto stream = context.stream();
     dim3 threads;
     dim3 grid;
     if (batch_size == 1) {
@@ -39,7 +38,7 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
     }
 
     if (value.prev_out_value) {
-      math::gemm<platform::GPUPlace, T>(
+      math::gemm<platform::CUDADeviceContext, T>(
           context, false, false, batch_size, frame_size * 2, frame_size, 1,
           value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
           1, value.gate_value, frame_size * 3);
@@ -62,7 +61,7 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
     }
 
     if (value.prev_out_value) {
-      math::gemm<platform::GPUPlace, T>(
+      math::gemm<platform::CUDADeviceContext, T>(
           context, false, false, batch_size, frame_size, frame_size, 1,
           value.reset_output_value, frame_size, value.state_weight, frame_size,
           1, value.gate_value + frame_size * 2, frame_size * 3);
@@ -87,14 +86,13 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
 };
 
 template <typename T>
-struct GRUUnitGradFunctor<platform::GPUPlace, T> {
-  static void compute(const platform::DeviceContext &context,
+struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext &context,
                       hl_gru_value<T> value, hl_gru_grad<T> grad,
                       int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate) {
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    auto stream = context.stream();
     dim3 threads;
     dim3 grid;
     if (batch_size == 1) {
@@ -124,13 +122,13 @@ struct GRUUnitGradFunctor<platform::GPUPlace, T> {
     }
 
     if (value.prev_out_value && grad.prev_out_grad) {
-      math::gemm<platform::GPUPlace, T>(
+      math::gemm<platform::CUDADeviceContext, T>(
           context, false, true, batch_size, frame_size, frame_size, 1,
           grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
           frame_size, 0, grad.reset_output_grad, frame_size);
 
       if (grad.state_weight_grad) {
-        math::gemm<platform::GPUPlace, T>(
+        math::gemm<platform::CUDADeviceContext, T>(
             context, true, false, frame_size, frame_size, batch_size, 1,
             value.reset_output_value, frame_size,
             grad.gate_grad + frame_size * 2, frame_size * 3, 1,
@@ -155,13 +153,13 @@ struct GRUUnitGradFunctor<platform::GPUPlace, T> {
     }
 
     if (grad.prev_out_grad && value.prev_out_value) {
-      math::gemm<platform::GPUPlace, T>(
+      math::gemm<platform::CUDADeviceContext, T>(
           context, false, true, batch_size, frame_size, frame_size * 2, 1,
           grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
           grad.prev_out_grad, frame_size);
 
       if (grad.gate_weight_grad) {
-        math::gemm<platform::GPUPlace, T>(
+        math::gemm<platform::CUDADeviceContext, T>(
             context, true, false, frame_size, frame_size * 2, batch_size, 1,
             value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
             grad.gate_weight_grad, frame_size * 2);
@@ -170,10 +168,10 @@ struct GRUUnitGradFunctor<platform::GPUPlace, T> {
   }
 };
 
-template struct GRUUnitFunctor<platform::GPUPlace, float>;
-template struct GRUUnitFunctor<platform::GPUPlace, double>;
-template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
-template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
+template struct GRUUnitFunctor<platform::CUDADeviceContext, float>;
+template struct GRUUnitFunctor<platform::CUDADeviceContext, double>;
+template struct GRUUnitGradFunctor<platform::CUDADeviceContext, float>;
+template struct GRUUnitGradFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
index 58ea59f68e..ca1343cb2c 100644
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
@@ -40,19 +40,18 @@ struct hl_gru_grad {
   T *prev_out_grad;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct GRUUnitFunctor {
-  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frame_size, int batch_size,
+  static void compute(const DeviceContext &context, hl_gru_value<T> value,
+                      int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct GRUUnitGradFunctor {
-  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad,
-                      int frame_size, int batch_size,
+  static void compute(const DeviceContext &context, hl_gru_value<T> value,
+                      hl_gru_grad<T> grad, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate);
 };
diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc
index c10c44c520..707ebf0596 100644
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
@@ -25,9 +25,9 @@ namespace math {
  */
 template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CPUPlace, T> {
+                    platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& im, const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col) {
@@ -90,9 +90,9 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
  */
 template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CPUPlace, T> {
+                    platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
@@ -149,13 +149,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
 };
 
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUPlace, float>;
+                             platform::CPUDeviceContext, float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUPlace, double>;
+                             platform::CPUDeviceContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUPlace, float>;
+                             platform::CPUDeviceContext, float>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CPUPlace, double>;
+                             platform::CPUDeviceContext, double>;
 
 /*
  * im = [input_channels, input_height, input_width]
@@ -164,9 +164,9 @@ template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
  */
 template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CPUPlace, T> {
+                    platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& im, const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col) {
@@ -235,9 +235,9 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
  */
 template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CPUPlace, T> {
+                    platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
@@ -300,13 +300,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
 };
 
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUPlace, float>;
+                             platform::CPUDeviceContext, float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUPlace, double>;
+                             platform::CPUDeviceContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUPlace, float>;
+                             platform::CPUDeviceContext, float>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CPUPlace, double>;
+                             platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu
index bf78942439..a88e837b03 100644
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
@@ -58,9 +58,9 @@ __global__ void im2col(const T* data_im, int num_outs, int im_height,
  */
 template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::GPUPlace, T> {
+                    platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& im, const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col) {
@@ -96,9 +96,7 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int block_y = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(block_x, block_y);
-    im2col<T><<<grid, threads, 0,
-                reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                    .stream()>>>(
+    im2col<T><<<grid, threads, 0, context.stream()>>>(
         im.data<T>(), num_outputs, im_height, im_width, dilation[0],
         dilation[1], filter_height, filter_width, stride[0], stride[1],
         padding[0], padding[1], col_height, col_width, col->data<T>());
@@ -160,9 +158,9 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
  */
 template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::GPUPlace, T> {
+                    platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
@@ -203,9 +201,7 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
 
     // To avoid involving atomic operations, we will launch one kernel per
     // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<<grid, threads, 0,
-                reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                    .stream()>>>(
+    col2im<T><<<grid, threads, 0, context.stream()>>>(
         num_kernels, col.data<T>(), im_height, im_width, dilation[0],
         dilation[1], filter_height, filter_width, stride[0], stride[1],
         padding[0], padding[2], col_height, col_width, im->data<T>());
@@ -213,13 +209,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
 };
 
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::GPUPlace, float>;
+                             platform::CUDADeviceContext, float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::GPUPlace, double>;
+                             platform::CUDADeviceContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::GPUPlace, float>;
+                             platform::CUDADeviceContext, float>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::GPUPlace, double>;
+                             platform::CUDADeviceContext, double>;
 
 template <class T>
 __global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
@@ -260,9 +256,9 @@ __global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
  */
 template <class T>
 class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::GPUPlace, T> {
+                    platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& im, const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col) {
@@ -310,9 +306,7 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     int block_dim_z = 1024 / block_dim_x / block_dim_y;
     dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
     dim3 grid(col_width, col_height);
-    im2colOCF<T><<<grid, threads, 0,
-                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                       .stream()>>>(
+    im2colOCF<T><<<grid, threads, 0, context.stream()>>>(
         im.data<T>(), im_channels, im_height, im_width, filter_height,
         filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
         col_width, col->data<T>());
@@ -358,9 +352,9 @@ __global__ void col2imOCF(const T* col_data, int im_channels, int im_height,
  */
 template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::GPUPlace, T> {
+                    platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
@@ -409,9 +403,7 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
     int block_dim_z = 1024 / block_dim_x / block_dim_y;
     dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
     dim3 grid(col_width, col_height);
-    col2imOCF<T><<<grid, threads, 0,
-                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                       .stream()>>>(
+    col2imOCF<T><<<grid, threads, 0, context.stream()>>>(
         col.data<T>(), im_channels, im_height, im_width, filter_height,
         filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
         col_width, im->data<T>());
@@ -419,13 +411,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
 };
 
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::GPUPlace, float>;
+                             platform::CUDADeviceContext, float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::GPUPlace, double>;
+                             platform::CUDADeviceContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::GPUPlace, float>;
+                             platform::CUDADeviceContext, float>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::GPUPlace, double>;
+                             platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h
index 24fd9a06e9..38f2c9fe0a 100644
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
@@ -79,20 +79,19 @@ enum class ColFormat { kCFO = 0, kOCF = 1 };
  * \note The caller needs to ensure that imShape.inputChannels is equal to
  *       colShape.inputChannels.
  */
-template <ColFormat Format, typename Place, typename T>
+template <ColFormat Format, typename DeviceContext, typename T>
 class Im2ColFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& im, const std::vector<int>& dilation,
+  void operator()(const DeviceContext& context, const framework::Tensor& im,
+                  const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col);
 };
 
-template <ColFormat Format, typename Place, typename T>
+template <ColFormat Format, typename DeviceContext, typename T>
 class Col2ImFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& col,
+  void operator()(const DeviceContext& context, const framework::Tensor& col,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im);
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index ae197a97ed..256f3bc9bd 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <iostream>
 
-template <typename Place>
+template <typename DeviceContext, typename Place>
 void testIm2col() {
   paddle::framework::Tensor input_tmp;
   paddle::framework::Tensor input;
@@ -59,18 +59,7 @@ void testIm2col() {
   memcpy(input_ptr, arr, 6 * sizeof(float));
 
   auto* place = new Place();
-  paddle::platform::DeviceContext* context;
-  if (paddle::platform::is_cpu_place(*place)) {
-    context =
-        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    context =
-        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
-#else
-    PADDLE_THROW("no GPU support");
-#endif  // PADDLE_WITH_CUDA
-  }
+  DeviceContext* context = new DeviceContext(*place);
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
@@ -83,10 +72,10 @@ void testIm2col() {
 
   // Im2Col
   paddle::operators::math::Im2ColFunctor<
-      paddle::operators::math::ColFormat::kCFO, Place, float>
+      paddle::operators::math::ColFormat::kCFO, DeviceContext, float>
       im2col;
   paddle::operators::math::Im2ColFunctor<
-      paddle::operators::math::ColFormat::kOCF, Place, float>
+      paddle::operators::math::ColFormat::kOCF, DeviceContext, float>
       im2col_ocf;
 
   im2col(*context, input, dilation, stride, padding, &output_cfo);
@@ -119,10 +108,10 @@ void testIm2col() {
 
   // Col2Im: kCFO
   paddle::operators::math::Col2ImFunctor<
-      paddle::operators::math::ColFormat::kCFO, Place, float>
+      paddle::operators::math::ColFormat::kCFO, DeviceContext, float>
       col2im;
   paddle::operators::math::Col2ImFunctor<
-      paddle::operators::math::ColFormat::kOCF, Place, float>
+      paddle::operators::math::ColFormat::kOCF, DeviceContext, float>
       col2im_ocf;
   float col2im_data[] = {0, 2, 2, 3, 8, 5};
 
@@ -168,8 +157,8 @@ void testIm2col() {
 }
 
 TEST(math, im2col) {
-  testIm2col<paddle::platform::CPUPlace>();
+  testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
-  testIm2col<paddle::platform::GPUPlace>();
+  testIm2col<paddle::platform::CUDADeviceContext, paddle::platform::GPUPlace>();
 #endif
 }
diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc
index ad3a59bcdb..2c2e8bb82e 100644
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
@@ -21,8 +21,8 @@ namespace operators {
 namespace math {
 
 template <class T>
-struct LstmUnitFunctor<platform::CPUPlace, T> {
-  static void compute(const platform::DeviceContext& context,
+struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
                       const std::string& gate_act, const std::string& cell_act,
                       const std::string& cand_act) {
@@ -42,8 +42,8 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {
 };
 
 template <class T>
-struct LstmUnitGradFunctor<platform::CPUPlace, T> {
-  static void compute(const platform::DeviceContext& context,
+struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       const std::string& gate_act, const std::string& cell_act,
@@ -72,10 +72,10 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class LstmUnitFunctor<platform::CPUPlace, float>;
-template class LstmUnitFunctor<platform::CPUPlace, double>;
-template class LstmUnitGradFunctor<platform::CPUPlace, float>;
-template class LstmUnitGradFunctor<platform::CPUPlace, double>;
+template class LstmUnitFunctor<platform::CPUDeviceContext, float>;
+template class LstmUnitFunctor<platform::CPUDeviceContext, double>;
+template class LstmUnitGradFunctor<platform::CPUDeviceContext, float>;
+template class LstmUnitGradFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu
index b2122f2a5c..92b1f4228b 100644
--- a/paddle/operators/math/lstm_compute.cu
+++ b/paddle/operators/math/lstm_compute.cu
@@ -21,8 +21,8 @@ namespace operators {
 namespace math {
 
 template <class T>
-struct LstmUnitFunctor<platform::GPUPlace, T> {
-  static void compute(const platform::DeviceContext& context,
+struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, int frame_size, int batch_size,
                       const std::string& gate_act, const std::string& cell_act,
                       const std::string& cand_act) {
@@ -33,8 +33,8 @@ struct LstmUnitFunctor<platform::GPUPlace, T> {
 };
 
 template <class T>
-struct LstmUnitGradFunctor<platform::GPUPlace, T> {
-  static void compute(const platform::DeviceContext& context,
+struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext& context,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       const std::string& gate_act, const std::string& cell_act,
@@ -45,10 +45,10 @@ struct LstmUnitGradFunctor<platform::GPUPlace, T> {
   }
 };
 
-template class LstmUnitFunctor<platform::GPUPlace, float>;
-template class LstmUnitFunctor<platform::GPUPlace, double>;
-template class LstmUnitGradFunctor<platform::GPUPlace, float>;
-template class LstmUnitGradFunctor<platform::GPUPlace, double>;
+template class LstmUnitFunctor<platform::CUDADeviceContext, float>;
+template class LstmUnitFunctor<platform::CUDADeviceContext, double>;
+template class LstmUnitGradFunctor<platform::CUDADeviceContext, float>;
+template class LstmUnitGradFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h
index 9652399d4c..5f74e27358 100644
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
@@ -67,21 +67,20 @@ inline activation_mode_t ActiveType(const std::string &type) {
   }
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LstmUnitFunctor {
  public:
-  static void compute(const platform::DeviceContext &context,
-                      LstmMetaValue<T> value, int frame_size, int batch_size,
+  static void compute(const DeviceContext &context, LstmMetaValue<T> value,
+                      int frame_size, int batch_size,
                       const std::string &gate_act, const std::string &cell_act,
                       const std::string &cand_act);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LstmUnitGradFunctor {
  public:
-  static void compute(const platform::DeviceContext &context,
-                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                      int frame_size, int batch_size,
+  static void compute(const DeviceContext &context, LstmMetaValue<T> value,
+                      LstmMetaGrad<T> grad, int frame_size, int batch_size,
                       const std::string &gate_act, const std::string &cell_act,
                       const std::string &cand_act);
 };
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 2e333a8cde..a05810d778 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -21,13 +21,11 @@ namespace operators {
 namespace math {
 
 template <>
-void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
-                                     const CBLAS_TRANSPOSE transA,
-                                     const CBLAS_TRANSPOSE transB, const int M,
-                                     const int N, const int K,
-                                     const float alpha, const float* A,
-                                     const float* B, const float beta,
-                                     float* C) {
+void gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
   int lda = (transA == CblasNoTrans) ? K : M;
   int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
@@ -36,13 +34,11 @@ void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
 }
 
 template <>
-void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
-                                      const CBLAS_TRANSPOSE transA,
-                                      const CBLAS_TRANSPOSE transB, const int M,
-                                      const int N, const int K,
-                                      const double alpha, const double* A,
-                                      const double* B, const double beta,
-                                      double* C) {
+void gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
   int lda = (transA == CblasNoTrans) ? K : M;
   int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
@@ -51,35 +47,32 @@ void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
 }
 
 template <>
-void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
-                                     const bool transA, const bool transB,
-                                     const int M, const int N, const int K,
-                                     const float alpha, const float* A,
-                                     const int lda, const float* B,
-                                     const int ldb, const float beta, float* C,
-                                     const int ldc) {
+void gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K, const float alpha,
+    const float* A, const int lda, const float* B, const int ldb,
+    const float beta, float* C, const int ldc) {
   cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
               transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
               lda, B, ldb, beta, C, ldc);
 }
 
 template <>
-void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
-                                      const bool transA, const bool transB,
-                                      const int M, const int N, const int K,
-                                      const double alpha, const double* A,
-                                      const int lda, const double* B,
-                                      const int ldb, const double beta,
-                                      double* C, const int ldc) {
+void gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const int lda, const double* B,
+    const int ldb, const double beta, double* C, const int ldc) {
   cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
               transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
               lda, B, ldb, beta, C, ldc);
 }
 
 template <>
-void matmul<platform::CPUPlace, float>(
-    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
+void matmul<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, float alpha,
     framework::Tensor* matrix_out, float beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
@@ -99,15 +92,16 @@ void matmul<platform::CPUPlace, float>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
-  gemm<platform::CPUPlace, float>(
+  gemm<platform::CPUDeviceContext, float>(
       context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
       matrix_b.data<float>(), beta, matrix_out->data<float>());
 }
 
 template <>
-void matmul<platform::CPUPlace, double>(
-    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
+void matmul<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, double alpha,
     framework::Tensor* matrix_out, double beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
@@ -127,16 +121,16 @@ void matmul<platform::CPUPlace, double>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
-  gemm<platform::CPUPlace, double>(
+  gemm<platform::CPUDeviceContext, double>(
       context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
       matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
-#ifdef PADDLE_USE_MKLML
+#ifdef PADDLE_WITH_MKLML
 // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
 template <>
-void batched_gemm<platform::CPUPlace, float>(
-    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+void batched_gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const float alpha, const float* A, const float* B, const float beta,
     float* C, const int batchCount, const int strideA, const int strideB) {
@@ -157,8 +151,8 @@ void batched_gemm<platform::CPUPlace, float>(
 }
 
 template <>
-void batched_gemm<platform::CPUPlace, double>(
-    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+void batched_gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const double alpha, const double* A, const double* B, const double beta,
     double* C, const int batchCount, const int strideA, const int strideB) {
@@ -183,8 +177,8 @@ void batched_gemm<platform::CPUPlace, double>(
 // functions of Intel MKL are not available. In the future, this computation
 // should be parallelized.
 template <>
-void batched_gemm<platform::CPUPlace, float>(
-    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+void batched_gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const float alpha, const float* A, const float* B, const float beta,
     float* C, const int batchCount, const int strideA, const int strideB) {
@@ -192,14 +186,14 @@ void batched_gemm<platform::CPUPlace, float>(
     const float* Ak = &A[k * strideA];
     const float* Bk = &B[k * strideB];
     float* Ck = &C[k * M * N];
-    gemm<platform::CPUPlace, float>(context, transA, transB, M, N, K, alpha, Ak,
-                                    Bk, beta, Ck);
+    gemm<platform::CPUDeviceContext, float>(context, transA, transB, M, N, K,
+                                            alpha, Ak, Bk, beta, Ck);
   }
 }
 
 template <>
-void batched_gemm<platform::CPUPlace, double>(
-    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+void batched_gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const double alpha, const double* A, const double* B, const double beta,
     double* C, const int batchCount, const int strideA, const int strideB) {
@@ -207,55 +201,53 @@ void batched_gemm<platform::CPUPlace, double>(
     const double* Ak = &A[k * strideA];
     const double* Bk = &B[k * strideB];
     double* Ck = &C[k * M * N];
-    gemm<platform::CPUPlace, double>(context, transA, transB, M, N, K, alpha,
-                                     Ak, Bk, beta, Ck);
+    gemm<platform::CPUDeviceContext, double>(context, transA, transB, M, N, K,
+                                             alpha, Ak, Bk, beta, Ck);
   }
 }
 #endif
 
 template <>
-void gemv<platform::CPUPlace, float>(const platform::DeviceContext& context,
-                                     const bool trans_a, const int M,
-                                     const int N, const float alpha,
-                                     const float* A, const float* B,
-                                     const float beta, float* C) {
+void gemv<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
+    const int N, const float alpha, const float* A, const float* B,
+    const float beta, float* C) {
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
 }
 
 template <>
-void gemv<platform::CPUPlace, double>(const platform::DeviceContext& context,
-                                      const bool trans_a, const int M,
-                                      const int N, const double alpha,
-                                      const double* A, const double* B,
-                                      const double beta, double* C) {
+void gemv<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
+    const int N, const double alpha, const double* A, const double* B,
+    const double beta, double* C) {
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
 }
 
 template <>
-void axpy<platform::CPUPlace, float>(const platform::DeviceContext& context,
-                                     const int n, const float alpha,
-                                     const float* x, float* y) {
+void axpy<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const int n, const float alpha,
+    const float* x, float* y) {
   cblas_saxpy(n, alpha, x, 1, y, 1);
 }
 
 template <>
-void axpy<platform::CPUPlace, double>(const platform::DeviceContext& context,
-                                      const int n, const double alpha,
-                                      const double* x, double* y) {
+void axpy<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const int n, const double alpha,
+    const double* x, double* y) {
   cblas_daxpy(n, alpha, x, 1, y, 1);
 }
 
-template struct SetConstant<platform::CPUPlace, float>;
-template struct SetConstant<platform::CPUPlace, double>;
-template struct SetConstant<platform::CPUPlace, int>;
-template struct SetConstant<platform::CPUPlace, int64_t>;
-template struct SetConstant<platform::CPUPlace, bool>;
+template struct SetConstant<platform::CPUDeviceContext, float>;
+template struct SetConstant<platform::CPUDeviceContext, double>;
+template struct SetConstant<platform::CPUDeviceContext, int>;
+template struct SetConstant<platform::CPUDeviceContext, int64_t>;
+template struct SetConstant<platform::CPUDeviceContext, bool>;
 
-#define DEFINE_CPU_TRANS(RANK)                                \
-  template struct Transpose<platform::CPUPlace, float, RANK>; \
-  template struct Transpose<platform::CPUPlace, double, RANK>;
+#define DEFINE_CPU_TRANS(RANK)                                        \
+  template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;
 
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);
@@ -285,6 +277,14 @@ void set_constant_with_place<platform::CPUPlace>(
                            TensorSetConstantCPU(tensor, value));
 }
 
+template <>
+void set_constant_with_place<platform::MKLDNNPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstantCPU(tensor, value));
+}
+
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
   TensorSetConstantWithPlace(const platform::DeviceContext& context,
                              framework::Tensor* tensor, float value)
@@ -310,10 +310,10 @@ void set_constant(const platform::DeviceContext& context,
 #endif
 }
 
-template struct RowwiseAdd<platform::CPUPlace, float>;
-template struct RowwiseAdd<platform::CPUPlace, double>;
-template struct ColwiseSum<platform::CPUPlace, float>;
-template struct ColwiseSum<platform::CPUPlace, double>;
+template struct RowwiseAdd<platform::CPUDeviceContext, float>;
+template struct RowwiseAdd<platform::CPUDeviceContext, double>;
+template struct ColwiseSum<platform::CPUDeviceContext, float>;
+template struct ColwiseSum<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 3018e50a4f..7852bb53a9 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -22,13 +22,11 @@ namespace operators {
 namespace math {
 
 template <>
-void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
-                                     const CBLAS_TRANSPOSE transA,
-                                     const CBLAS_TRANSPOSE transB, const int M,
-                                     const int N, const int K,
-                                     const float alpha, const float* A,
-                                     const float* B, const float beta,
-                                     float* C) {
+void gemm<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -39,19 +37,16 @@ void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 
   PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, N));
 }
 
 template <>
-void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
-                                      const CBLAS_TRANSPOSE transA,
-                                      const CBLAS_TRANSPOSE transB, const int M,
-                                      const int N, const int K,
-                                      const double alpha, const double* A,
-                                      const double* B, const double beta,
-                                      double* C) {
+void gemm<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -61,51 +56,45 @@ void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
   cublasOperation_t cuTransB =
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, N));
 }
 
 template <>
-void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
-                                     const bool transA, const bool transB,
-                                     const int M, const int N, const int K,
-                                     const float alpha, const float* A,
-                                     const int lda, const float* B,
-                                     const int ldb, const float beta, float* C,
-                                     const int ldc) {
+void gemm<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K, const float alpha,
+    const float* A, const int lda, const float* B, const int ldb,
+    const float beta, float* C, const int ldc) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
   cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
   PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, ldc));
 }
 
 template <>
-void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
-                                      const bool transA, const bool transB,
-                                      const int M, const int N, const int K,
-                                      const double alpha, const double* A,
-                                      const int lda, const double* B,
-                                      const int ldb, const double beta,
-                                      double* C, const int ldc) {
+void gemm<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const int lda, const double* B,
+    const int ldb, const double beta, double* C, const int ldc) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
   cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
   PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, ldc));
 }
 
 template <>
-void matmul<platform::GPUPlace, float>(
-    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
+void matmul<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, float alpha,
     framework::Tensor* matrix_out, float beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
@@ -125,15 +114,16 @@ void matmul<platform::GPUPlace, float>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
-  gemm<platform::GPUPlace, float>(
+  gemm<platform::CUDADeviceContext, float>(
       context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
       matrix_b.data<float>(), beta, matrix_out->data<float>());
 }
 
 template <>
-void matmul<platform::GPUPlace, double>(
-    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
+void matmul<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, double alpha,
     framework::Tensor* matrix_out, double beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
@@ -153,14 +143,14 @@ void matmul<platform::GPUPlace, double>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
-  gemm<platform::GPUPlace, double>(
+  gemm<platform::CUDADeviceContext, double>(
       context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
       matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
 template <>
-void batched_gemm<platform::GPUPlace, float>(
-    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+void batched_gemm<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const float alpha, const float* A, const float* B, const float beta,
     float* C, const int batchCount, const int strideA, const int strideB) {
@@ -176,15 +166,13 @@ void batched_gemm<platform::GPUPlace, float>(
   const int strideC = M * N;
 
   PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA,
-      &beta, C, ldc, strideC, batchCount));
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
+      strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
 }
 
 template <>
-void batched_gemm<platform::GPUPlace, double>(
-    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+void batched_gemm<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const double alpha, const double* A, const double* B, const double beta,
     double* C, const int batchCount, const int strideA, const int strideB) {
@@ -200,68 +188,58 @@ void batched_gemm<platform::GPUPlace, double>(
   const int strideC = M * N;
 
   PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA,
-      &beta, C, ldc, strideC, batchCount));
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
+      strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
 }
 
 template <>
-void gemv<platform::GPUPlace, float>(const platform::DeviceContext& context,
-                                     const bool trans_a, const int M,
-                                     const int N, const float alpha,
-                                     const float* A, const float* B,
-                                     const float beta, float* C) {
+void gemv<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const bool trans_a, const int M,
+    const int N, const float alpha, const float* A, const float* B,
+    const float beta, float* C) {
   cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
 
-  PADDLE_ENFORCE(platform::dynload::cublasSgemv(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1));
+  PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(),
+                                                cuTransA, N, M, &alpha, A, N, B,
+                                                1, &beta, C, 1));
 }
 
 template <>
-void gemv<platform::GPUPlace, double>(const platform::DeviceContext& context,
-                                      const bool trans_a, const int M,
-                                      const int N, const double alpha,
-                                      const double* A, const double* B,
-                                      const double beta, double* C) {
+void gemv<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const bool trans_a, const int M,
+    const int N, const double alpha, const double* A, const double* B,
+    const double beta, double* C) {
   cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
-  PADDLE_ENFORCE(platform::dynload::cublasDgemv(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1));
+  PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(),
+                                                cuTransA, N, M, &alpha, A, N, B,
+                                                1, &beta, C, 1));
 }
 
 template <>
-void axpy<platform::GPUPlace, float>(const platform::DeviceContext& context,
-                                     const int n, const float alpha,
-                                     const float* x, float* y) {
-  PADDLE_ENFORCE(platform::dynload::cublasSaxpy(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      n, &alpha, x, 1, y, 1));
+void axpy<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const int n, const float alpha,
+    const float* x, float* y) {
+  PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n,
+                                                &alpha, x, 1, y, 1));
 }
 
 template <>
-void axpy<platform::GPUPlace, double>(const platform::DeviceContext& context,
-                                      const int n, const double alpha,
-                                      const double* x, double* y) {
-  PADDLE_ENFORCE(platform::dynload::cublasDaxpy(
-      reinterpret_cast<const platform::CUDADeviceContext&>(context)
-          .cublas_handle(),
-      n, &alpha, x, 1, y, 1));
+void axpy<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const int n, const double alpha,
+    const double* x, double* y) {
+  PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n,
+                                                &alpha, x, 1, y, 1));
 }
 
-template struct SetConstant<platform::GPUPlace, float>;
-template struct SetConstant<platform::GPUPlace, double>;
-template struct SetConstant<platform::GPUPlace, int>;
-template struct SetConstant<platform::GPUPlace, int64_t>;
-template struct SetConstant<platform::GPUPlace, bool>;
+template struct SetConstant<platform::CUDADeviceContext, float>;
+template struct SetConstant<platform::CUDADeviceContext, double>;
+template struct SetConstant<platform::CUDADeviceContext, int>;
+template struct SetConstant<platform::CUDADeviceContext, int64_t>;
+template struct SetConstant<platform::CUDADeviceContext, bool>;
 
-#define DEFINE_GPU_TRANS(RANK)                                \
-  template struct Transpose<platform::GPUPlace, float, RANK>; \
-  template struct Transpose<platform::GPUPlace, double, RANK>;
+#define DEFINE_GPU_TRANS(RANK)                                         \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>;
 
 DEFINE_GPU_TRANS(1);
 DEFINE_GPU_TRANS(2);
@@ -277,8 +255,9 @@ struct TensorSetConstantGPU {
 
   template <typename T>
   void operator()() const {
-    SetConstant<platform::GPUPlace, T> functor;
-    functor(context_, tensor_, static_cast<T>(value_));
+    SetConstant<platform::CUDADeviceContext, T> functor;
+    functor(reinterpret_cast<const platform::CUDADeviceContext&>(context_),
+            tensor_, static_cast<T>(value_));
   }
 
   const platform::DeviceContext& context_;
@@ -294,27 +273,34 @@ void set_constant_with_place<platform::GPUPlace>(
                            TensorSetConstantGPU(context, tensor, value));
 }
 
-template struct RowwiseAdd<platform::GPUPlace, float>;
-template struct RowwiseAdd<platform::GPUPlace, double>;
-template struct ColwiseSum<platform::GPUPlace, float>;
-// template struct ColwiseSum<platform::GPUPlace, double>;
-// The ColwiseSum<platform::GPUPlace, double> failed in debug mode,
+template <>
+void set_constant_with_place<platform::CUDNNPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  set_constant_with_place<platform::GPUPlace>(context, tensor, value);
+}
+
+template struct RowwiseAdd<platform::CUDADeviceContext, float>;
+template struct RowwiseAdd<platform::CUDADeviceContext, double>;
+template struct ColwiseSum<platform::CUDADeviceContext, float>;
+// template struct ColwiseSum<platform::CUDADeviceContext, double>;
+// The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
 // and only failed for this case. So reimplemented it.
 template <>
-void ColwiseSum<platform::GPUPlace, double>::operator()(
-    const platform::DeviceContext& context, const framework::Tensor& input,
+void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
+    const platform::CUDADeviceContext& context, const framework::Tensor& input,
     framework::Tensor* vector) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[0];
   PADDLE_ENFORCE_EQ(vector->numel(), size);
   framework::Tensor one;
   one.mutable_data<double>({in_dims[0]}, context.GetPlace());
-  SetConstant<platform::GPUPlace, double> set;
+  SetConstant<platform::CUDADeviceContext, double> set;
   set(context, &one, static_cast<double>(1.0));
-  gemv<platform::GPUPlace, double>(context, true, static_cast<int>(in_dims[0]),
-                                   static_cast<int>(in_dims[1]), 1.0,
-                                   input.data<double>(), one.data<double>(),
-                                   0.0, vector->data<double>());
+  gemv<platform::CUDADeviceContext, double>(
+      context, true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]),
+      1.0, input.data<double>(), one.data<double>(), 0.0,
+      vector->data<double>());
 }
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index 5a42854f22..8cc03c2ba0 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#ifdef PADDLE_USE_MKLML
+#ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
 #include <mkl_vml_functions.h>
@@ -62,53 +62,51 @@ namespace math {
 // Then matrixA: M * K, matrixB: K * N, matrixC : M * N
 // For more detailed info, please refer to
 // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
-template <typename Place, typename T>
-void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+template <typename DeviceContext, typename T>
+void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
           const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
           const T alpha, const T* A, const T* B, const T beta, T* C);
 
 // gemm wrapper with stride args for matrix uncontinuous in memory
-template <typename Place, typename T>
-void gemm(const platform::DeviceContext& context, const bool transA,
-          const bool transB, const int M, const int N, const int K,
-          const T alpha, const T* A, const int lda, const T* B, const int ldb,
-          const T beta, T* C, const int ldc);
+template <typename DeviceContext, typename T>
+void gemm(const DeviceContext& context, const bool transA, const bool transB,
+          const int M, const int N, const int K, const T alpha, const T* A,
+          const int lda, const T* B, const int ldb, const T beta, T* C,
+          const int ldc);
 
 // matrix multiply with continuous memory
-template <typename Place, typename T>
-void matmul(const platform::DeviceContext& context,
-            const framework::Tensor& matrix_a, bool trans_a,
-            const framework::Tensor& matrix_b, bool trans_b, T alpha,
-            framework::Tensor* matrix_out, T beta);
+template <typename DeviceContext, typename T>
+void matmul(const DeviceContext& context, const framework::Tensor& matrix_a,
+            bool trans_a, const framework::Tensor& matrix_b, bool trans_b,
+            T alpha, framework::Tensor* matrix_out, T beta);
 
 // Batched gemm
-template <typename Place, typename T>
-void batched_gemm(const platform::DeviceContext& context,
-                  const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-                  const int M, const int N, const int K, const T alpha,
-                  const T* A, const T* B, const T beta, T* C,
-                  const int batchCount, const int strideA, const int strideB);
-
-template <typename Place, typename T>
-void gemv(const platform::DeviceContext& context, const bool trans_a,
-          const int M, const int N, const T alpha, const T* A, const T* B,
-          const T beta, T* C);
-
-template <typename Place, typename T>
-void axpy(const platform::DeviceContext& context, const int n, const T alpha,
-          const T* x, T* y);
-
-template <typename Place, typename T, int Rank>
+template <typename DeviceContext, typename T>
+void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE transB, const int M, const int N,
+                  const int K, const T alpha, const T* A, const T* B,
+                  const T beta, T* C, const int batchCount, const int strideA,
+                  const int strideB);
+
+template <typename DeviceContext, typename T>
+void gemv(const DeviceContext& context, const bool trans_a, const int M,
+          const int N, const T alpha, const T* A, const T* B, const T beta,
+          T* C);
+
+template <typename DeviceContext, typename T>
+void axpy(const DeviceContext& context, const int n, const T alpha, const T* x,
+          T* y);
+
+template <typename DeviceContext, typename T, int Rank>
 struct Transpose {
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& in, framework::Tensor* out,
-                  const std::vector<int>& axis);
+  void operator()(const DeviceContext& context, const framework::Tensor& in,
+                  framework::Tensor* out, const std::vector<int>& axis);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SetConstant {
-  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor* tensor, T num);
+  void operator()(const DeviceContext& context, framework::Tensor* tensor,
+                  T num);
 };
 
 template <typename Place>
@@ -118,17 +116,16 @@ void set_constant_with_place(const platform::DeviceContext& context,
 void set_constant(const platform::DeviceContext& context,
                   framework::Tensor* tensor, float value);
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct RowwiseAdd {
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, const framework::Tensor& vec,
-                  framework::Tensor* output);
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& vec, framework::Tensor* output);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct ColwiseSum {
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* vec);
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
index 4dc17a4e52..3e6d833865 100644
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -20,16 +20,17 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename Place, typename T>
-void SetConstant<Place, T>::operator()(const platform::DeviceContext& context,
-                                       framework::Tensor* tensor, T num) {
+template <typename DeviceContext, typename T>
+void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
+                                               framework::Tensor* tensor,
+                                               T num) {
   auto t = framework::EigenVector<T>::Flatten(*tensor);
-  t.device(*context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(num));
+  t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
 }
 
-template <typename Place, typename T, int Rank>
-void Transpose<Place, T, Rank>::operator()(
-    const platform::DeviceContext& context, const framework::Tensor& in,
+template <typename DeviceContext, typename T, int Rank>
+void Transpose<DeviceContext, T, Rank>::operator()(
+    const DeviceContext& context, const framework::Tensor& in,
     framework::Tensor* out, const std::vector<int>& axis) {
   Eigen::array<int, Rank> permute;
   for (int i = 0; i < Rank; i++) {
@@ -40,15 +41,15 @@ void Transpose<Place, T, Rank>::operator()(
 
   auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
   auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
-  auto* dev = context.GetEigenDevice<Place>();
+  auto* dev = context.eigen_device();
   eigen_out.device(*dev) = eigen_in.shuffle(permute);
 }
 
-template <typename Place, typename T>
-void RowwiseAdd<Place, T>::operator()(const platform::DeviceContext& context,
-                                      const framework::Tensor& input,
-                                      const framework::Tensor& vector,
-                                      framework::Tensor* output) {
+template <typename DeviceContext, typename T>
+void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
+                                              const framework::Tensor& input,
+                                              const framework::Tensor& vector,
+                                              framework::Tensor* output) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[0];
   PADDLE_ENFORCE_EQ(vector.numel(), size);
@@ -59,14 +60,14 @@ void RowwiseAdd<Place, T>::operator()(const platform::DeviceContext& context,
   auto out = framework::EigenMatrix<T>::From(*output);
   Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
   Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
-  out.device(*context.GetEigenDevice<Place>()) =
+  out.device(*context.eigen_device()) =
       in + vec.reshape(shape).broadcast(bcast);
 }
 
-template <typename Place, typename T>
-void ColwiseSum<Place, T>::operator()(const platform::DeviceContext& context,
-                                      const framework::Tensor& input,
-                                      framework::Tensor* vector) {
+template <typename DeviceContext, typename T>
+void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
+                                              const framework::Tensor& input,
+                                              framework::Tensor* vector) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[0];
   PADDLE_ENFORCE_EQ(vector->numel(), size);
@@ -74,7 +75,7 @@ void ColwiseSum<Place, T>::operator()(const platform::DeviceContext& context,
   auto vec = framework::EigenMatrix<T>::From(*vector);
   auto in = framework::EigenMatrix<T>::From(input);
   Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
-  vec.reshape(shape).device(*context.GetEigenDevice<Place>()) =
+  vec.reshape(shape).device(*context.eigen_device()) =
       in.sum(Eigen::array<int, 1>({{0}})).reshape(shape);
 }
 
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 983c9fdcff..7c6f098ca9 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -21,7 +21,7 @@ TEST(math_function, gemm_notrans_cblas) {
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
   paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+  paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
       context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1,
       input3_ptr + 1, 4);
 
@@ -55,7 +55,7 @@ TEST(math_function, gemm_trans_clbas) {
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
   paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+  paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
       context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
       input3_ptr + 1, 4);
 
@@ -74,7 +74,8 @@ TEST(math_function, zero) {
   auto* cpu_place = new paddle::platform::CPUPlace();
   float* t = tensor.mutable_data<float>({2, 2}, *cpu_place);
   paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUPlace, float>
+  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
+                                       float>
       functor;
   functor(context, &tensor, 0);
   EXPECT_EQ(t[0], 0);
@@ -110,7 +111,7 @@ void GemvTest(int m, int n, bool trans) {
   }
 
   paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::gemv<paddle::platform::CPUPlace, T>(
+  paddle::operators::math::gemv<paddle::platform::CPUDeviceContext, T>(
       context, trans, static_cast<int>(m), static_cast<int>(n), 1., data_a,
       data_b, 0., data_c);
 
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu
index d5d6f0c73b..32e96d9487 100644
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
@@ -21,7 +21,7 @@ TEST(math_function, notrans_mul_trans) {
 
   out_gpu.mutable_data<float>({2, 2}, *gpu_place);
 
-  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
   paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out);
@@ -55,7 +55,7 @@ TEST(math_function, trans_mul_notrans) {
 
   out_gpu.mutable_data<float>({3, 3}, *gpu_place);
 
-  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
   paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out);
@@ -106,7 +106,7 @@ TEST(math_function, gemm_notrans_cublas) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
 
-  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
   paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3);
@@ -161,7 +161,7 @@ TEST(math_function, gemm_trans_cublas) {
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
 
-  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
   paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3);
@@ -208,7 +208,7 @@ void GemvTest(int m, int n, bool trans) {
   paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a);
   paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b);
 
-  paddle::operators::math::gemv<paddle::platform::GPUPlace, T>(
+  paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
       context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
       g_data_b, 0., g_data_c);
 
diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h
index 6ba9a0ba9a..7048e11e6f 100644
--- a/paddle/operators/math/matmul.h
+++ b/paddle/operators/math/matmul.h
@@ -26,13 +26,12 @@ namespace math {
 //
 // Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported
 // yet.
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MatMulFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& a, bool trans_a,
-                  const framework::Tensor& b, bool trans_b, T alpha,
-                  framework::Tensor* out, T beta) {
+  void operator()(const DeviceContext& context, const framework::Tensor& a,
+                  bool trans_a, const framework::Tensor& b, bool trans_b,
+                  T alpha, framework::Tensor* out, T beta) {
     auto dim_a = a.dims();
     auto dim_b = b.dims();
 
@@ -108,13 +107,13 @@ class MatMulFunctor {
 
     if (!batchCount) {
       // regular matrix multiplication
-      gemm<Place, T>(context, transA, transB, M, N, kA, alpha, a.data<T>(),
-                     b.data<T>(), beta, out->data<T>());
+      gemm<DeviceContext, T>(context, transA, transB, M, N, kA, alpha,
+                             a.data<T>(), b.data<T>(), beta, out->data<T>());
     } else {
       // batched matrix multiplication
-      batched_gemm<Place, T>(context, transA, transB, M, N, kA, alpha,
-                             a.data<T>(), b.data<T>(), beta, out->data<T>(),
-                             batchCount, strideA, strideB);
+      batched_gemm<DeviceContext, T>(
+          context, transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(),
+          beta, out->data<T>(), batchCount, strideA, strideB);
     }
   }
 };
diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc
index c9003962d3..fea86675f7 100644
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
@@ -20,9 +20,9 @@ namespace math {
 
 // All tensors are in NCHW format, and the groups must be greater than 1
 template <typename T>
-class MaxOutFunctor<platform::CPUPlace, T> {
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, framework::Tensor* output,
                   int groups) {
     const int batch_size = input.dims()[0];
@@ -54,9 +54,9 @@ class MaxOutFunctor<platform::CPUPlace, T> {
 };
 
 template <class T>
-class MaxOutGradFunctor<platform::CPUPlace, T> {
+class MaxOutGradFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, int groups) {
@@ -91,10 +91,10 @@ class MaxOutGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxOutGradFunctor<platform::CPUPlace, float>;
-template class MaxOutGradFunctor<platform::CPUPlace, double>;
-template class MaxOutFunctor<platform::CPUPlace, float>;
-template class MaxOutFunctor<platform::CPUPlace, double>;
+template class MaxOutGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxOutGradFunctor<platform::CPUDeviceContext, double>;
+template class MaxOutFunctor<platform::CPUDeviceContext, float>;
+template class MaxOutFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu
index c3fabcae08..6056ad251c 100644
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
@@ -78,9 +78,9 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
  * All tensors are in NCHW format.
  */
 template <typename T>
-class MaxOutFunctor<platform::GPUPlace, T> {
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, framework::Tensor* output,
                   int groups) {
     const int batch_size = input.dims()[0];
@@ -98,20 +98,18 @@ class MaxOutFunctor<platform::GPUPlace, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxOut<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, input_channels,
-                              input_height, input_width, groups, output_data);
+    KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width, groups,
+        output_data);
   }
 };
 /*
  * All tensors are in NCHW format.
  */
 template <typename T>
-class MaxOutGradFunctor<platform::GPUPlace, T> {
+class MaxOutGradFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, int groups) {
@@ -132,20 +130,17 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxoutGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data,
-                              output_grad_data, input_grad_data, input_channels,
-                              input_height, input_width, groups);
+    KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, groups);
   }
 };
 
-template class MaxOutGradFunctor<platform::GPUPlace, float>;
-template class MaxOutGradFunctor<platform::GPUPlace, double>;
+template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
 
-template class MaxOutFunctor<platform::GPUPlace, float>;
-template class MaxOutFunctor<platform::GPUPlace, double>;
+template class MaxOutFunctor<platform::CUDADeviceContext, float>;
+template class MaxOutFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h
index 2d9069b0b3..68f4743db0 100644
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
@@ -23,20 +23,18 @@ namespace math {
 
 #define FLT_MAX __FLT_MAX__
 
-template <typename Place, typename T>
-
+template <typename DeviceContext, typename T>
 class MaxOutFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* output,
-                  int groups);
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
 };
 
-template <typename Place, class T>
+template <typename DeviceContext, class T>
 class MaxOutGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* input_grad,
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* input_grad,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, int groups);
 };
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
index 135984586a..150de6fd59 100644
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -24,9 +24,9 @@ namespace math {
  * height and width, respectively.
  */
 template <typename PoolProcess, typename T>
-class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
+class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   PoolProcess pool_process, framework::Tensor* output) {
@@ -84,9 +84,9 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
 * and width, respectively.
 */
 template <typename PoolProcess, class T>
-class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -152,9 +152,9 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  * height and width, respectively.
  */
 template <class T>
-class MaxPool2dGradFunctor<platform::CPUPlace, T> {
+class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -213,25 +213,29 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxPool2dGradFunctor<platform::CPUPlace, float>;
-template class MaxPool2dGradFunctor<platform::CPUPlace, double>;
+template class MaxPool2dGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxPool2dGradFunctor<platform::CPUDeviceContext, double>;
 
-template class Pool2dFunctor<platform::CPUPlace,
+template class Pool2dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::MaxPool<float>, float>;
-template class Pool2dFunctor<platform::CPUPlace,
+template class Pool2dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::AvgPool<float>, float>;
-template class Pool2dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
-template class Pool2dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
-template class Pool2dFunctor<platform::CPUPlace,
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool2dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::MaxPool<double>, double>;
-template class Pool2dFunctor<platform::CPUPlace,
+template class Pool2dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::AvgPool<double>, double>;
-template class Pool2dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
-template class Pool2dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
 
 /*
  * All tensors are in NCDHW format.
@@ -239,9 +243,9 @@ template class Pool2dGradFunctor<
  * depth, height and width, respectively.
  */
 template <typename PoolProcess, class T>
-class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
+class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   PoolProcess pool_process, framework::Tensor* output) {
@@ -314,9 +318,9 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
  * depth, height and width, respectively.
  */
 template <typename PoolProcess, class T>
-class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -398,9 +402,9 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  * depth, height and width, respectively.
  */
 template <class T>
-class MaxPool3dGradFunctor<platform::CPUPlace, T> {
+class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -473,25 +477,29 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxPool3dGradFunctor<platform::CPUPlace, float>;
-template class MaxPool3dGradFunctor<platform::CPUPlace, double>;
+template class MaxPool3dGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxPool3dGradFunctor<platform::CPUDeviceContext, double>;
 
-template class Pool3dFunctor<platform::CPUPlace,
+template class Pool3dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::MaxPool<float>, float>;
-template class Pool3dFunctor<platform::CPUPlace,
+template class Pool3dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::AvgPool<float>, float>;
-template class Pool3dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
-template class Pool3dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
-template class Pool3dFunctor<platform::CPUPlace,
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool3dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::MaxPool<double>, double>;
-template class Pool3dFunctor<platform::CPUPlace,
+template class Pool3dFunctor<platform::CPUDeviceContext,
                              paddle::operators::math::AvgPool<double>, double>;
-template class Pool3dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
-template class Pool3dGradFunctor<
-    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
 
 /*
  * All tensors are in NCHW format.
@@ -499,9 +507,9 @@ template class Pool3dGradFunctor<
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexFunctor<platform::CPUPlace, T1, T2> {
+class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   framework::Tensor* output, framework::Tensor* mask) {
@@ -564,9 +572,9 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T1, T2> {
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
+class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
@@ -602,10 +610,14 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float, int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float, int>;
-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double, int>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double, int>;
+template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, float,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, float,
+                                             int>;
+template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, double,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, double,
+                                             int>;
 
 /*
  * All tensors are in NCDHW format.
@@ -613,9 +625,9 @@ template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double, int>;
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexFunctor<platform::CPUPlace, T1, T2> {
+class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   framework::Tensor* output, framework::Tensor* mask) {
@@ -692,9 +704,9 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T1, T2> {
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
+class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
@@ -735,10 +747,14 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float, int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float, int>;
-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double, int>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double, int>;
+template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, float,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, float,
+                                             int>;
+template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, double,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, double,
+                                             int>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
index ca3560f264..0243cf8316 100644
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
@@ -155,9 +155,9 @@ __global__ void KernelMaxPool2DGrad(
  * height and width, respectively.
  */
 template <typename PoolProcess, typename T>
-class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
+class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   PoolProcess pool_process, framework::Tensor* output) {
@@ -183,11 +183,7 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelPool2D<
-        PoolProcess,
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+    KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
         stride_width, padding_height, padding_width, pool_process, output_data);
@@ -200,9 +196,9 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
  * height and width, respectively.
  */
 template <typename PoolProcess, typename T>
-class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -231,11 +227,7 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelPool2DGrad<
-        PoolProcess,
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+    KernelPool2DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, output_data, output_grad_data, input_channels,
         input_height, input_width, output_height, output_width, ksize_height,
         ksize_width, stride_height, stride_width, padding_height, padding_width,
@@ -249,9 +241,9 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
  * height and width, respectively.
  */
 template <typename T>
-class MaxPool2dGradFunctor<platform::GPUPlace, T> {
+class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -281,10 +273,7 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool2DGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+    KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, output_data, output_grad_data, input_channels,
         input_height, input_width, output_height, output_width, ksize_height,
         ksize_width, stride_height, stride_width, padding_height, padding_width,
@@ -292,25 +281,29 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
   }
 };
 
-template class MaxPool2dGradFunctor<platform::GPUPlace, float>;
-template class MaxPool2dGradFunctor<platform::GPUPlace, double>;
+template class MaxPool2dGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxPool2dGradFunctor<platform::CUDADeviceContext, double>;
 
-template class Pool2dFunctor<platform::GPUPlace,
+template class Pool2dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::MaxPool<float>, float>;
-template class Pool2dFunctor<platform::GPUPlace,
+template class Pool2dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::AvgPool<float>, float>;
-template class Pool2dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
-template class Pool2dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
-template class Pool2dFunctor<platform::GPUPlace,
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool2dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::MaxPool<double>, double>;
-template class Pool2dFunctor<platform::GPUPlace,
+template class Pool2dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::AvgPool<double>, double>;
-template class Pool2dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
-template class Pool2dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
 
 template <typename PoolProcess, typename T>
 __global__ void KernelPool3D(const int nthreads, const T* input_data,
@@ -478,9 +471,9 @@ __global__ void KernelMaxPool3DGrad(
  * depth, height and width, respectively.
  */
 template <typename PoolProcess, class T>
-class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
+class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   PoolProcess pool_process, framework::Tensor* output) {
@@ -512,11 +505,7 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelPool3D<
-        PoolProcess,
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+    KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_depth, input_height,
         input_width, output_depth, output_height, output_width, ksize_depth,
         ksize_height, ksize_width, stride_depth, stride_height, stride_width,
@@ -531,9 +520,9 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
  * depth, height and width, respectively.
  */
 template <typename PoolProcess, class T>
-class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -569,11 +558,7 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelPool3DGrad<
-        PoolProcess,
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+    KernelPool3DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, output_data, output_grad_data, input_channels,
         input_depth, input_height, input_width, output_depth, output_height,
         output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
@@ -588,9 +573,9 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
  * depth, height and width, respectively.
  */
 template <class T>
-class MaxPool3dGradFunctor<platform::GPUPlace, T> {
+class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
@@ -626,10 +611,7 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool3DGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
+    KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, output_data, output_grad_data, input_channels,
         input_depth, input_height, input_width, output_depth, output_height,
         output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
@@ -638,25 +620,29 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
   }
 };
 
-template class MaxPool3dGradFunctor<platform::GPUPlace, float>;
-template class MaxPool3dGradFunctor<platform::GPUPlace, double>;
+template class MaxPool3dGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxPool3dGradFunctor<platform::CUDADeviceContext, double>;
 
-template class Pool3dFunctor<platform::GPUPlace,
+template class Pool3dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::MaxPool<float>, float>;
-template class Pool3dFunctor<platform::GPUPlace,
+template class Pool3dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::AvgPool<float>, float>;
-template class Pool3dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
-template class Pool3dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
-template class Pool3dFunctor<platform::GPUPlace,
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool3dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::MaxPool<double>, double>;
-template class Pool3dFunctor<platform::GPUPlace,
+template class Pool3dFunctor<platform::CUDADeviceContext,
                              paddle::operators::math::AvgPool<double>, double>;
-template class Pool3dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
-template class Pool3dGradFunctor<
-    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
 
 template <typename T1, typename T2>
 __global__ void KernelMaxPool2dWithIdx(
@@ -747,9 +733,9 @@ __global__ void KernelMaxPool2DWithIdxGrad(
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexFunctor<platform::GPUPlace, T1, T2> {
+class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   framework::Tensor* output, framework::Tensor* mask) {
@@ -776,10 +762,7 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T1, T2> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool2dWithIdx<
-        T1, T2><<<grid, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(
+    KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
         stride_width, padding_height, padding_width, output_data, mask_data);
@@ -792,9 +775,9 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T1, T2> {
  * height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
+class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
@@ -821,10 +804,7 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool2DWithIdxGrad<
-        T1, T2><<<grid, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(
+    KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, output_grad_data, mask_data, input_channels, input_height,
         input_width, output_height, output_width, ksize_height, ksize_width,
         stride_height, stride_width, padding_height, padding_width,
@@ -832,10 +812,14 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float, int>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float, int>;
-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double, int>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double, int>;
+template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, float,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, float,
+                                             int>;
+template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, double,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext,
+                                             double, int>;
 
 template <typename T1, typename T2>
 __global__ void KernelMaxPool3DWithIdx(
@@ -950,9 +934,9 @@ __global__ void KernelMaxPool3DWithIdxGrad(
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexFunctor<platform::GPUPlace, T1, T2> {
+class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   framework::Tensor* output, framework::Tensor* mask) {
@@ -985,10 +969,7 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T1, T2> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool3DWithIdx<
-        T1, T2><<<grid, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(
+    KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_depth, input_height,
         input_width, output_depth, output_height, output_width, ksize_depth,
         ksize_height, ksize_width, stride_depth, stride_height, stride_width,
@@ -1002,9 +983,9 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T1, T2> {
  * depth, height and width, respectively.
  */
 template <typename T1, typename T2>
-class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
+class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
@@ -1037,10 +1018,7 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool3DWithIdxGrad<
-        T1, T2><<<grid, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(
+    KernelMaxPool3DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, output_grad_data, mask_data, input_channels, input_depth,
         input_height, input_width, output_depth, output_height, output_width,
         ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
@@ -1049,10 +1027,14 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float, int>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float, int>;
-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double, int>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double, int>;
+template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, float,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, float,
+                                             int>;
+template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, double,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext,
+                                             double, int>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
index 19fbd8b4bb..2759f06cb6 100644
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -84,62 +84,58 @@ class AvgPoolGrad {
  * This is different from average pooling. So we rewrite the max_pool_grad:
  * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
  */
-template <typename Place, typename PoolProcess, typename T>
+template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute, framework::Tensor* output);
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* output);
 };
 
-template <typename Place, typename PoolProcess, typename T>
+template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool2dGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   PoolProcess pool_compute, framework::Tensor* input_grad);
 };
 
-template <typename Place, class T>
+template <typename DeviceContext, class T>
 class MaxPool2dGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   framework::Tensor* input_grad);
 };
 
-template <typename Place, typename PoolProcess, typename T>
+template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool3dFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute, framework::Tensor* output);
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* output);
 };
 
-template <typename Place, typename PoolProcess, typename T>
+template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool3dGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   PoolProcess pool_compute, framework::Tensor* input_grad);
 };
 
-template <typename Place, class T>
+template <typename DeviceContext, class T>
 class MaxPool3dGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
@@ -153,38 +149,38 @@ class MaxPool3dGradFunctor {
  * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
  * NCDHW format.
  */
-template <typename Place, typename T1, typename T2>
+template <typename DeviceContext, typename T1, typename T2>
 class MaxPool2dWithIndexFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask);
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask);
 };
 
-template <typename Place, typename T1, typename T2>
+template <typename DeviceContext, typename T1, typename T2>
 class MaxPool2dWithIndexGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
                   framework::Tensor* input_grad);
 };
 
-template <typename Place, typename T1, typename T2>
+template <typename DeviceContext, typename T1, typename T2>
 class MaxPool3dWithIndexFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask);
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask);
 };
 
-template <typename Place, typename T1, typename T2>
+template <typename DeviceContext, typename T1, typename T2>
 class MaxPool3dWithIndexGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc
index 514f2adef2..ab758d1e7f 100644
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -19,8 +19,8 @@ namespace paddle {
 namespace operators {
 namespace math {
 template <typename T>
-struct SelectedRowsAdd<platform::CPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::SelectedRows& input2,
                   framework::SelectedRows* output) {
@@ -67,12 +67,12 @@ struct SelectedRowsAdd<platform::CPUPlace, T> {
   }
 };
 
-template struct SelectedRowsAdd<platform::CPUPlace, float>;
-template struct SelectedRowsAdd<platform::CPUPlace, double>;
+template struct SelectedRowsAdd<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAdd<platform::CPUDeviceContext, double>;
 
 template <typename T>
-struct SelectedRowsAddTensor<platform::CPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::Tensor& input2, framework::Tensor* output) {
     auto in1_height = input1.height();
@@ -88,7 +88,7 @@ struct SelectedRowsAddTensor<platform::CPUPlace, T> {
     PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
     PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
 
-    SetConstant<platform::CPUPlace, T> functor;
+    SetConstant<platform::CPUDeviceContext, T> functor;
     functor(context, output, 0.0);
 
     auto* in1_data = in1_value.data<T>();
@@ -103,17 +103,16 @@ struct SelectedRowsAddTensor<platform::CPUPlace, T> {
 
     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.GetEigenDevice<platform::CPUPlace>()) =
-        out_eigen + in2_eigen;
+    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
   }
 };
 
-template struct SelectedRowsAddTensor<platform::CPUPlace, float>;
-template struct SelectedRowsAddTensor<platform::CPUPlace, double>;
+template struct SelectedRowsAddTensor<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAddTensor<platform::CPUDeviceContext, double>;
 
 template <typename T>
-struct SelectedRowsAddTo<platform::CPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input1,
                   const int64_t input2_offset,
                   framework::SelectedRows* input2) {
@@ -143,14 +142,14 @@ struct SelectedRowsAddTo<platform::CPUPlace, T> {
   }
 };
 
-template struct SelectedRowsAddTo<platform::CPUPlace, float>;
-template struct SelectedRowsAddTo<platform::CPUPlace, double>;
-template struct SelectedRowsAddTo<platform::CPUPlace, int>;
-template struct SelectedRowsAddTo<platform::CPUPlace, int64_t>;
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, int>;
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, int64_t>;
 
 template <typename T>
-struct SelectedRowsAddToTensor<platform::CPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input1,
                   framework::Tensor* input2) {
     auto in1_height = input1.height();
@@ -175,10 +174,10 @@ struct SelectedRowsAddToTensor<platform::CPUPlace, T> {
   }
 };
 
-template struct SelectedRowsAddToTensor<platform::CPUPlace, float>;
-template struct SelectedRowsAddToTensor<platform::CPUPlace, double>;
-template struct SelectedRowsAddToTensor<platform::CPUPlace, int>;
-template struct SelectedRowsAddToTensor<platform::CPUPlace, int64_t>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
index c1dd323ba2..c44577e00a 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -20,8 +20,8 @@ namespace paddle {
 namespace operators {
 namespace math {
 template <typename T>
-struct SelectedRowsAdd<platform::GPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::SelectedRows& input2,
                   framework::SelectedRows* output) {
@@ -64,16 +64,15 @@ struct SelectedRowsAdd<platform::GPUPlace, T> {
         reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
 
     auto* in2_data = in2_value.data<T>();
-    memory::Copy(
-        boost::get<platform::GPUPlace>(out_place), out_data + in1_value.numel(),
-        boost::get<platform::GPUPlace>(in2_place), in2_data,
-        in2_value.numel() * sizeof(T),
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
+    memory::Copy(boost::get<platform::GPUPlace>(out_place),
+                 out_data + in1_value.numel(),
+                 boost::get<platform::GPUPlace>(in2_place), in2_data,
+                 in2_value.numel() * sizeof(T), context.stream());
   }
 };
 
-template struct SelectedRowsAdd<platform::GPUPlace, float>;
-template struct SelectedRowsAdd<platform::GPUPlace, double>;
+template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
 
 namespace {
 template <typename T, int block_size>
@@ -96,8 +95,8 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
 }  // namespace
 
 template <typename T>
-struct SelectedRowsAddTensor<platform::GPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::Tensor& input2, framework::Tensor* output) {
     auto in1_height = input1.height();
@@ -117,30 +116,28 @@ struct SelectedRowsAddTensor<platform::GPUPlace, T> {
     auto* in2_data = input2.data<T>();
     auto* out_data = output->data<T>();
 
-    SetConstant<platform::GPUPlace, T> functor;
+    SetConstant<platform::CUDADeviceContext, T> functor;
     functor(context, output, 0.0);
 
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid(1, in1_rows.size());
-    SelectedRowsAddTensorKernel<T, block_size><<<
-        grid, threads, 0,
-        reinterpret_cast<const platform::CUDADeviceContext&>(context)
-            .stream()>>>(in1_data, in1_rows.data(), out_data, in1_row_numel);
+    SelectedRowsAddTensorKernel<
+        T, block_size><<<grid, threads, 0, context.stream()>>>(
+        in1_data, in1_rows.data(), out_data, in1_row_numel);
 
     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.GetEigenDevice<platform::GPUPlace>()) =
-        out_eigen + in2_eigen;
+    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
   }
 };
 
-template struct SelectedRowsAddTensor<platform::GPUPlace, float>;
-template struct SelectedRowsAddTensor<platform::GPUPlace, double>;
+template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
 
 template <typename T>
-struct SelectedRowsAddTo<platform::GPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input1,
                   const int64_t input2_offset,
                   framework::SelectedRows* input2) {
@@ -163,18 +160,17 @@ struct SelectedRowsAddTo<platform::GPUPlace, T> {
 
     auto* in1_data = in1_value.data<T>();
     auto* in2_data = in2_value->data<T>();
-    memory::Copy(
-        boost::get<platform::GPUPlace>(in2_place), in2_data + input2_offset,
-        boost::get<platform::GPUPlace>(in1_place), in1_data,
-        in1_value.numel() * sizeof(T),
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
+    memory::Copy(boost::get<platform::GPUPlace>(in2_place),
+                 in2_data + input2_offset,
+                 boost::get<platform::GPUPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T), context.stream());
   }
 };
 
-template struct SelectedRowsAddTo<platform::GPUPlace, float>;
-template struct SelectedRowsAddTo<platform::GPUPlace, double>;
-template struct SelectedRowsAddTo<platform::GPUPlace, int>;
-template struct SelectedRowsAddTo<platform::GPUPlace, int64_t>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
 
 namespace {
 template <typename T, int block_size>
@@ -197,8 +193,8 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
 }  // namespace
 
 template <typename T>
-struct SelectedRowsAddToTensor<platform::GPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input1,
                   framework::Tensor* input2) {
     auto in1_height = input1.height();
@@ -216,17 +212,16 @@ struct SelectedRowsAddToTensor<platform::GPUPlace, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid(1, in1_rows.size());
-    SelectedRowsAddToTensorKernel<T, block_size><<<
-        grid, threads, 0,
-        reinterpret_cast<const platform::CUDADeviceContext&>(context)
-            .stream()>>>(in1_data, in1_rows.data(), in2_data, in1_row_numel);
+    SelectedRowsAddToTensorKernel<
+        T, block_size><<<grid, threads, 0, context.stream()>>>(
+        in1_data, in1_rows.data(), in2_data, in1_row_numel);
   }
 };
 
-template struct SelectedRowsAddToTensor<platform::GPUPlace, float>;
-template struct SelectedRowsAddToTensor<platform::GPUPlace, double>;
-template struct SelectedRowsAddToTensor<platform::GPUPlace, int>;
-template struct SelectedRowsAddToTensor<platform::GPUPlace, int64_t>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h
index d6dc6c03c9..1149075abf 100644
--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/operators/math/selected_rows_functor.h
@@ -21,33 +21,33 @@ namespace math {
 
 // SelectedRows + SelectedRows will simplely concat value and rows.
 // The real computation happens in dealing with LoDTensor.
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAdd {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::SelectedRows& input2,
                   framework::SelectedRows* output);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAddTensor {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input1,
                   const framework::Tensor& input2, framework::Tensor* output);
 };
 
 // input2 = input1 + input2
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAddTo {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input1,
                   const int64_t input2_offset, framework::SelectedRows* input2);
 };
 
 // input2 = input1 + input2
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SelectedRowsAddToTensor {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input1,
                   framework::Tensor* input2);
 };
diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc
index a3649b6875..8c74cab0a1 100644
--- a/paddle/operators/math/selected_rows_functor_test.cc
+++ b/paddle/operators/math/selected_rows_functor_test.cc
@@ -23,7 +23,7 @@ TEST(selected_rows_functor, cpu_add) {
 
   CPUPlace cpu_place;
   CPUDeviceContext ctx(cpu_place);
-  SetConstant<CPUPlace, float> functor;
+  SetConstant<CPUDeviceContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
@@ -47,7 +47,7 @@ TEST(selected_rows_functor, cpu_add) {
   // simplely concat two SelectedRows
   out_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
 
-  SelectedRowsAdd<CPUPlace, float> add_functor;
+  SelectedRowsAdd<CPUDeviceContext, float> add_functor;
   add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
 
   auto out_height = output->height();
@@ -85,7 +85,7 @@ TEST(selected_rows_functor, cpu_add) {
   std::unique_ptr<Tensor> tensor2{new Tensor()};
   tensor2->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
 
-  SelectedRowsAddTensor<CPUPlace, float> add_tensor_functor;
+  SelectedRowsAddTensor<CPUDeviceContext, float> add_tensor_functor;
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
   auto* tensor2_data = tensor2->data<float>();
@@ -112,7 +112,7 @@ TEST(selected_rows_functor, cpu_add_to) {
 
   CPUPlace cpu_place;
   CPUDeviceContext ctx(cpu_place);
-  SetConstant<CPUPlace, float> functor;
+  SetConstant<CPUDeviceContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
@@ -137,7 +137,7 @@ TEST(selected_rows_functor, cpu_add_to) {
   // simplely concat two SelectedRows
   out_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
 
-  SelectedRowsAddTo<CPUPlace, float> add_to_functor;
+  SelectedRowsAddTo<CPUDeviceContext, float> add_to_functor;
   add_to_functor(ctx, *selected_rows1, 0, output.get());
   add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
 
@@ -173,7 +173,7 @@ TEST(selected_rows_functor, cpu_add_to) {
   tensor1->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  SelectedRowsAddToTensor<CPUPlace, float> add_to_tensor_functor;
+  SelectedRowsAddToTensor<CPUDeviceContext, float> add_to_tensor_functor;
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
   auto* tensor1_data = tensor1->data<float>();
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu
index 7de9291c17..777caf5635 100644
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -24,7 +24,7 @@ TEST(selected_rows_functor, gpu_add) {
   GPUPlace gpu_place(0);
   CPUPlace cpu_place;
   CUDADeviceContext ctx(gpu_place);
-  SetConstant<GPUPlace, float> functor;
+  SetConstant<CUDADeviceContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
@@ -48,7 +48,7 @@ TEST(selected_rows_functor, gpu_add) {
   // simplely concat two SelectedRows
   out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
 
-  SelectedRowsAdd<GPUPlace, float> add_functor;
+  SelectedRowsAdd<CUDADeviceContext, float> add_functor;
   add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
 
   auto out_height = output->height();
@@ -90,7 +90,7 @@ TEST(selected_rows_functor, gpu_add) {
   std::unique_ptr<Tensor> tensor2{new Tensor()};
   tensor2->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
 
-  SelectedRowsAddTensor<GPUPlace, float> add_tensor_functor;
+  SelectedRowsAddTensor<CUDADeviceContext, float> add_tensor_functor;
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
   Tensor tensor2_cpu;
@@ -122,7 +122,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   GPUPlace gpu_place(0);
   CPUPlace cpu_place;
   CUDADeviceContext ctx(gpu_place);
-  SetConstant<GPUPlace, float> functor;
+  SetConstant<CUDADeviceContext, float> functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
@@ -147,7 +147,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   // simplely concat two SelectedRows
   out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
 
-  SelectedRowsAddTo<GPUPlace, float> add_to_functor;
+  SelectedRowsAddTo<CUDADeviceContext, float> add_to_functor;
   add_to_functor(ctx, *selected_rows1, 0, output.get());
   add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
 
@@ -187,7 +187,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  SelectedRowsAddToTensor<GPUPlace, float> add_to_tensor_functor;
+  SelectedRowsAddToTensor<CUDADeviceContext, float> add_to_tensor_functor;
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
   Tensor tensor1_cpu;
diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc
index 5b3bde02fb..88977be1f8 100644
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
@@ -19,9 +19,9 @@ namespace operators {
 namespace math {
 
 template <typename T>
-class CopyMatrixRowsFunctor<platform::CPUPlace, T> {
+class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& src, const size_t* index,
                   framework::Tensor& dst, bool is_src_index) {
     auto src_dims = src.dims();
@@ -48,13 +48,13 @@ class CopyMatrixRowsFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class CopyMatrixRowsFunctor<platform::CPUPlace, float>;
-template class CopyMatrixRowsFunctor<platform::CPUPlace, double>;
+template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, float>;
+template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, double>;
 
-template class LoDTensor2BatchFunctor<platform::CPUPlace, float>;
-template class LoDTensor2BatchFunctor<platform::CPUPlace, double>;
-template class Batch2LoDTensorFunctor<platform::CPUPlace, float>;
-template class Batch2LoDTensorFunctor<platform::CPUPlace, double>;
+template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, float>;
+template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, double>;
+template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, float>;
+template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu
index c5d968aeb2..452ae89510 100644
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
@@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index,
 }
 
 template <typename T>
-class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
+class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& src, const size_t* index,
                   framework::Tensor& dst, bool is_src_index) {
     auto src_dims = src.dims();
@@ -59,20 +59,19 @@ class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
 
     dim3 threads(128, 8);
     dim3 grid(8, 1);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    auto stream = context.stream();
     CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
         src_data, dst_data, index, height, width, is_src_index);
   }
 };
 
-template class CopyMatrixRowsFunctor<platform::GPUPlace, float>;
-template class CopyMatrixRowsFunctor<platform::GPUPlace, double>;
+template class CopyMatrixRowsFunctor<platform::CUDADeviceContext, float>;
+template class CopyMatrixRowsFunctor<platform::CUDADeviceContext, double>;
 
-template class LoDTensor2BatchFunctor<platform::GPUPlace, float>;
-template class LoDTensor2BatchFunctor<platform::GPUPlace, double>;
-template class Batch2LoDTensorFunctor<platform::GPUPlace, float>;
-template class Batch2LoDTensorFunctor<platform::GPUPlace, double>;
+template class LoDTensor2BatchFunctor<platform::CUDADeviceContext, float>;
+template class LoDTensor2BatchFunctor<platform::CUDADeviceContext, double>;
+template class Batch2LoDTensorFunctor<platform::CUDADeviceContext, float>;
+template class Batch2LoDTensorFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h
index 73295ddbcb..a5c43a2c7d 100644
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CopyMatrixRowsFunctor {
  public:
   // If is_src_index is true,
@@ -34,12 +34,12 @@ class CopyMatrixRowsFunctor {
   // If is_src_index is false,
   // copy the input src to the indexed rows of output dst.
   // The indexed rows are based on the input index.
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& src, const size_t* index,
-                  framework::Tensor& dst, bool is_src_index);
+  void operator()(const DeviceContext& context, const framework::Tensor& src,
+                  const size_t* index, framework::Tensor& dst,
+                  bool is_src_index);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class LoDTensor2BatchFunctor {
   // Calculate the length of each sequence and
   // sort sequence index by the length.
@@ -56,7 +56,7 @@ class LoDTensor2BatchFunctor {
   };
 
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::LoDTensor& lod_tensor,
                   framework::LoDTensor& batch, bool is_cal_batch_lod,
                   bool is_reverse = false) const {
@@ -65,7 +65,7 @@ class LoDTensor2BatchFunctor {
       PADDLE_ENFORCE_GT(lods.size(), 2UL);
       PADDLE_ENFORCE_EQ(lods[1].size(),
                         static_cast<size_t>(lod_tensor.dims()[0]));
-      CopyMatrixRowsFunctor<Place, T> to_batch;
+      CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
       to_batch(context, lod_tensor, lods[1].data(), batch, true);
       return;
     }
@@ -143,22 +143,22 @@ class LoDTensor2BatchFunctor {
     }
     batch.set_lod(batch_lods);
 
-    CopyMatrixRowsFunctor<Place, T> to_batch;
+    CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
     to_batch(context, lod_tensor, seq2batch_idx, batch, true);
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class Batch2LoDTensorFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::LoDTensor& batch,
                   framework::LoDTensor& lod_tensor) const {
     auto in_lod = batch.lod();
     PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
     PADDLE_ENFORCE_EQ(in_lod[1].size(),
                       static_cast<size_t>(lod_tensor.dims()[0]));
-    CopyMatrixRowsFunctor<Place, T> to_seq;
+    CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
     size_t* index = in_lod[1].data();
     to_seq(context, batch, index, lod_tensor, false);
   }
diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc
index 5913c99fdb..8fb92b1a13 100644
--- a/paddle/operators/math/sequence_pooling.cc
+++ b/paddle/operators/math/sequence_pooling.cc
@@ -20,9 +20,9 @@ namespace operators {
 namespace math {
 
 template <typename T>
-class MaxSeqPoolFunctor<platform::CPUPlace, T> {
+class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::LoDTensor& input, framework::Tensor* output,
                   framework::Tensor* index) {
     auto in_dims = input.dims();
@@ -60,9 +60,9 @@ class MaxSeqPoolFunctor<platform::CPUPlace, T> {
 };
 
 template <typename T>
-class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
+class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& out_grad,
                   const framework::Tensor& index,
                   framework::LoDTensor* in_grad) {
@@ -80,7 +80,7 @@ class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
     const int* max_index = index.data<int>();
     T* ig_data = in_grad->data<T>();
 
-    SetConstant<platform::CPUPlace, T> set_zero;
+    SetConstant<platform::CPUDeviceContext, T> set_zero;
     set_zero(context, in_grad, static_cast<T>(0.0));
     int64_t num_seq = og_dims[0];
     int64_t dim = out_grad.numel() / num_seq;
@@ -93,10 +93,10 @@ class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxSeqPoolFunctor<platform::CPUPlace, float>;
-template class MaxSeqPoolFunctor<platform::CPUPlace, double>;
-template class MaxSeqPoolGradFunctor<platform::CPUPlace, float>;
-template class MaxSeqPoolGradFunctor<platform::CPUPlace, double>;
+template class MaxSeqPoolFunctor<platform::CPUDeviceContext, float>;
+template class MaxSeqPoolFunctor<platform::CPUDeviceContext, double>;
+template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu
index 5ed951402f..4c9e6b375c 100644
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -46,9 +46,9 @@ __global__ void KeMaxSequencePool(const T* input, const size_t* starts,
 }
 
 template <typename T>
-class MaxSeqPoolFunctor<platform::GPUPlace, T> {
+class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::LoDTensor& input, framework::Tensor* output,
                   framework::Tensor* index) {
     auto in_dims = input.dims();
@@ -71,8 +71,7 @@ class MaxSeqPoolFunctor<platform::GPUPlace, T> {
 
     dim3 threads(256, 1);
     dim3 grid(num_seq, 1);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    auto stream = context.stream();
     KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
         in_data, starts.data(), out_data, max_index, num_seq, dim);
   }
@@ -91,9 +90,9 @@ __global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
 }
 
 template <typename T>
-class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
+class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& out_grad,
                   const framework::Tensor& index,
                   framework::LoDTensor* in_grad) {
@@ -111,7 +110,7 @@ class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
     const int* max_index = index.data<int>();
     T* ig_data = in_grad->data<T>();
 
-    SetConstant<platform::GPUPlace, T> set_zero;
+    SetConstant<platform::CUDADeviceContext, T> set_zero;
     set_zero(context, in_grad, static_cast<T>(0.0));
     int64_t num_seq = og_dims[0];
     int64_t dim = out_grad.numel() / num_seq;
@@ -119,17 +118,16 @@ class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
     unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
     dim3 threads(128, 1);
     dim3 grid(blocks, 1);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    auto stream = context.stream();
     KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
         og_data, max_index, ig_data, num_seq, dim);
   }
 };
 
-template class MaxSeqPoolFunctor<platform::GPUPlace, float>;
-template class MaxSeqPoolFunctor<platform::GPUPlace, double>;
-template class MaxSeqPoolGradFunctor<platform::GPUPlace, float>;
-template class MaxSeqPoolGradFunctor<platform::GPUPlace, double>;
+template class MaxSeqPoolFunctor<platform::CUDADeviceContext, float>;
+template class MaxSeqPoolFunctor<platform::CUDADeviceContext, double>;
+template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h
index 35dfe26de1..13ffb2ebef 100644
--- a/paddle/operators/math/sequence_pooling.h
+++ b/paddle/operators/math/sequence_pooling.h
@@ -23,18 +23,18 @@ namespace math {
 
 #define FLT_MAX __FLT_MAX__
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MaxSeqPoolFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::LoDTensor& input, framework::Tensor* output,
                   framework::Tensor* index);
 };
 
-template <typename Place, class T>
+template <typename DeviceContext, class T>
 class MaxSeqPoolGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::Tensor& out_grad,
                   const framework::Tensor& index,
                   framework::LoDTensor* in_grad);
diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc
index 3e2f15d6c2..72f10f35f4 100644
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
@@ -19,10 +19,10 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template class SoftmaxFunctor<platform::CPUPlace, float>;
-template class SoftmaxFunctor<platform::CPUPlace, double>;
-template class SoftmaxGradFunctor<platform::CPUPlace, float>;
-template class SoftmaxGradFunctor<platform::CPUPlace, double>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double>;
+template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
+template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu
index 4dbab51d46..9e73f6a371 100644
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
@@ -21,10 +21,10 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template class SoftmaxFunctor<platform::GPUPlace, float>;
-template class SoftmaxFunctor<platform::GPUPlace, double>;
-template class SoftmaxGradFunctor<platform::GPUPlace, float>;
-template class SoftmaxGradFunctor<platform::GPUPlace, double>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
+template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
+template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h
index fe10746502..471f44d340 100644
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
@@ -19,19 +19,18 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SoftmaxFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor* X, framework::Tensor* Y);
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SoftmaxGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor* y, const framework::Tensor* y_grad,
-                  framework::Tensor* x_grad);
+  void operator()(const DeviceContext& context, const framework::Tensor* y,
+                  const framework::Tensor* y_grad, framework::Tensor* x_grad);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h
index 05793eeb3e..82f597ff79 100644
--- a/paddle/operators/math/softmax_impl.h
+++ b/paddle/operators/math/softmax_impl.h
@@ -32,10 +32,10 @@ struct ValueClip {
   }
 };
 
-template <typename Place, typename T>
-void SoftmaxFunctor<Place, T>::operator()(
-    const platform::DeviceContext& context, const framework::Tensor* X,
-    framework::Tensor* Y) {
+template <typename DeviceContext, typename T>
+void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
+                                                  const framework::Tensor* X,
+                                                  framework::Tensor* Y) {
   auto logits = EigenMatrix<T>::From(*X);
   auto softmax = EigenMatrix<T>::From(*Y);
 
@@ -56,19 +56,18 @@ void SoftmaxFunctor<Place, T>::operator()(
                              .broadcast(one_by_class))
                             .unaryExpr(ValueClip<T>());
 
-  softmax.device(*context.GetEigenDevice<Place>()) = shifted_logits.exp();
-  softmax.device(*context.GetEigenDevice<Place>()) =
-      (softmax *
-       softmax.sum(along_class)
-           .inverse()
-           .eval()
-           .reshape(batch_by_one)
-           .broadcast(one_by_class));
+  softmax.device(*context.eigen_device()) = shifted_logits.exp();
+  softmax.device(*context.eigen_device()) = (softmax *
+                                             softmax.sum(along_class)
+                                                 .inverse()
+                                                 .eval()
+                                                 .reshape(batch_by_one)
+                                                 .broadcast(one_by_class));
 }
 
-template <typename Place, typename T>
-void SoftmaxGradFunctor<Place, T>::operator()(
-    const platform::DeviceContext& context, const framework::Tensor* y,
+template <typename DeviceContext, typename T>
+void SoftmaxGradFunctor<DeviceContext, T>::operator()(
+    const DeviceContext& context, const framework::Tensor* y,
     const framework::Tensor* y_grad, framework::Tensor* x_grad) {
   auto softmax = EigenMatrix<T>::From(*y);
   auto softmax_grad = EigenMatrix<T>::From(*y_grad);
@@ -89,8 +88,7 @@ void SoftmaxGradFunctor<Place, T>::operator()(
                  .eval()
                  .reshape(batch_by_one)
                  .broadcast(one_by_class);
-  logits_grad.device(*context.GetEigenDevice<Place>()) =
-      (softmax_grad - dot) * softmax;
+  logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax;
 }
 
 }  // namespace math
diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc
index b57d3dc141..ecd3a647e0 100644
--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/operators/math/unpooling.cc
@@ -17,9 +17,9 @@ namespace paddle {
 namespace operators {
 namespace math {
 template <typename T>
-class Unpool2dMaxFunctor<platform::CPUPlace, T> {
+class Unpool2dMaxFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& indices, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
@@ -48,9 +48,9 @@ class Unpool2dMaxFunctor<platform::CPUPlace, T> {
   }
 };
 template <class T>
-class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
+class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& indices,
                   const framework::Tensor& output,
@@ -82,10 +82,10 @@ class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
     }
   }
 };
-template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
-template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
-template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
-template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
+template class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, float>;
+template class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, double>;
+template class Unpool2dMaxFunctor<platform::CPUDeviceContext, float>;
+template class Unpool2dMaxFunctor<platform::CPUDeviceContext, double>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu
index 37c3c8b689..ecbde0f6a7 100644
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
@@ -67,9 +67,9 @@ __global__ void KernelUnpool2dMaxGrad(
  * All tensors are in NCHW format.
  */
 template <typename T>
-class Unpool2dMaxFunctor<platform::GPUPlace, T> {
+class Unpool2dMaxFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& indices, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
@@ -83,21 +83,18 @@ class Unpool2dMaxFunctor<platform::GPUPlace, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
     int threads = 1024;
     int grid = (input.numel() + threads - 1) / threads;
-    KernelUnpool2dMax<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(input.numel(), input_data, indices_data,
-                              input_height, input_width, output_channels,
-                              output_data, output_height, output_width);
+    KernelUnpool2dMax<T><<<grid, threads, 0, context.stream()>>>(
+        input.numel(), input_data, indices_data, input_height, input_width,
+        output_channels, output_data, output_height, output_width);
   }
 };
 /*
  * All tensors are in NCHW format.
  */
 template <typename T>
-class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
+class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& indices,
                   const framework::Tensor& output,
@@ -116,19 +113,16 @@ class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
     int threads = 1024;
     int grid = (input.numel() + threads - 1) / threads;
-    KernelUnpool2dMaxGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(input.numel(), input_data, indices_data,
-                              input_height, input_width, output_channels,
-                              output_data, output_grad_data, output_height,
-                              output_width, input_grad_data);
+    KernelUnpool2dMaxGrad<T><<<grid, threads, 0, context.stream()>>>(
+        input.numel(), input_data, indices_data, input_height, input_width,
+        output_channels, output_data, output_grad_data, output_height,
+        output_width, input_grad_data);
   }
 };
-template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
-template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
-template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
-template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
+template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, float>;
+template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, double>;
+template class Unpool2dMaxFunctor<platform::CUDADeviceContext, float>;
+template class Unpool2dMaxFunctor<platform::CUDADeviceContext, double>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h
index 7077d7c227..0f0ff1371e 100644
--- a/paddle/operators/math/unpooling.h
+++ b/paddle/operators/math/unpooling.h
@@ -18,18 +18,16 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 namespace math {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class Unpool2dMaxFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& indices, framework::Tensor* output);
 };
-template <typename Place, class T>
+template <typename DeviceContext, class T>
 class Unpool2dMaxGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& indices,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad,
diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc
index 99eb7fd46d..d574ed9234 100644
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
@@ -25,9 +25,9 @@ namespace math {
  *                    output_depth, output_height, output_width]
  */
 template <class T>
-class Vol2ColFunctor<platform::CPUPlace, T> {
+class Vol2ColFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& vol,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -111,9 +111,9 @@ class Vol2ColFunctor<platform::CPUPlace, T> {
  *                    output_depth, output_height, output_width]
  */
 template <class T>
-class Col2VolFunctor<platform::CPUPlace, T> {
+class Col2VolFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -190,10 +190,10 @@ class Col2VolFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class Vol2ColFunctor<platform::CPUPlace, float>;
-template class Vol2ColFunctor<platform::CPUPlace, double>;
-template class Col2VolFunctor<platform::CPUPlace, float>;
-template class Col2VolFunctor<platform::CPUPlace, double>;
+template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
+template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
+template class Col2VolFunctor<platform::CPUDeviceContext, float>;
+template class Col2VolFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu
index dae3be858e..b029442fe4 100644
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/operators/math/vol2col.cu
@@ -68,9 +68,9 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth,
  *                    output_depth, output_height, output_width]
  */
 template <class T>
-class Vol2ColFunctor<platform::GPUPlace, T> {
+class Vol2ColFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& vol,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -117,9 +117,7 @@ class Vol2ColFunctor<platform::GPUPlace, T> {
 
     const int threads = 1024;
     const int blocks = (num_outputs + 1024 - 1) / 1024;
-    vol2col<T><<<blocks, threads, 0,
-                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                     .stream()>>>(
+    vol2col<T><<<blocks, threads, 0, context.stream()>>>(
         num_outputs, vol.data<T>(), input_depth, input_height, input_width,
         dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
         filter_width, strides[0], strides[1], strides[2], paddings[0],
@@ -196,9 +194,9 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth,
  *                    output_depth, output_height, output_width]
  */
 template <class T>
-class Col2VolFunctor<platform::GPUPlace, T> {
+class Col2VolFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& col,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
@@ -245,9 +243,7 @@ class Col2VolFunctor<platform::GPUPlace, T> {
     const int threads = 1024;
     const int blocks = (num_kernels + 1024 - 1) / 1024;
 
-    col2vol<T><<<blocks, threads, 0,
-                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                     .stream()>>>(
+    col2vol<T><<<blocks, threads, 0, context.stream()>>>(
         num_kernels, col.data<T>(), input_depth, input_height, input_width,
         dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
         filter_width, strides[0], strides[1], strides[2], paddings[0],
@@ -256,10 +252,10 @@ class Col2VolFunctor<platform::GPUPlace, T> {
   }
 };
 
-template class Vol2ColFunctor<platform::GPUPlace, float>;
-template class Vol2ColFunctor<platform::GPUPlace, double>;
-template class Col2VolFunctor<platform::GPUPlace, float>;
-template class Col2VolFunctor<platform::GPUPlace, double>;
+template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
+template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
+template class Col2VolFunctor<platform::CUDADeviceContext, float>;
+template class Col2VolFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h
index dc64d1d977..dcd80370e8 100644
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
@@ -63,22 +63,20 @@ namespace math {
  * \note The caller needs to ensure that volShape.inputChannels is equal to
  *       colShape.inputChannels.
  */
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class Vol2ColFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& vol,
+  void operator()(const DeviceContext& context, const framework::Tensor& vol,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
                   framework::Tensor* col) const;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class Col2VolFunctor {
  public:
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& col,
+  void operator()(const DeviceContext& context, const framework::Tensor& col,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index 62c3152304..f46db3c567 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <iostream>
 
-template <typename Place>
+template <typename DeviceContext, typename Place>
 void testVol2col() {
   paddle::framework::Tensor input;
   paddle::framework::Tensor input_tmp;
@@ -24,18 +24,7 @@ void testVol2col() {
   paddle::framework::Tensor output_tmp;
 
   auto* place = new Place();
-  paddle::platform::DeviceContext* context;
-  if (paddle::platform::is_cpu_place(*place)) {
-    context =
-        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    context =
-        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
-#else
-    PADDLE_THROW("no GPU support");
-#endif  // PADDLE_WITH_CUDA
-  }
+  DeviceContext* context = new DeviceContext(*place);
 
   /**
    * input = [[0, 1, 2,
@@ -88,7 +77,7 @@ void testVol2col() {
                               output_depth, output_height, output_width},
                              *place);
 
-  paddle::operators::math::Vol2ColFunctor<Place, float> vol2col;
+  paddle::operators::math::Vol2ColFunctor<DeviceContext, float> vol2col;
   vol2col(*context, input, dilations, strides, paddings, &output);
 
   float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
@@ -113,7 +102,7 @@ void testVol2col() {
     CopyFrom(input_tmp, *place, *context, &input);
   }
 
-  paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
+  paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
   col2vol(*context, output, dilations, strides, paddings, &input);
 
   float* in_ptr;
@@ -130,8 +119,9 @@ void testVol2col() {
 }
 
 TEST(math, vol2col) {
-  testVol2col<paddle::platform::CPUPlace>();
+  testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
 #ifdef PADDLE_WITH_CUDA
-  testVol2col<paddle::platform::GPUPlace>();
+  testVol2col<paddle::platform::CUDADeviceContext,
+              paddle::platform::GPUPlace>();
 #endif  // PADDLE_WITH_CUDA
 }
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
index 5a1a615420..ee0bc0c370 100644
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -206,7 +206,8 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
             ops::MatMulOpGrad);
-REGISTER_OP_CPU_KERNEL(matmul,
-                       ops::MatMulKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    matmul_grad, ops::MatMulGradKernel<paddle::platform::CPUPlace, float>);
+    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc
index b7e66382f0..6a3772c004 100644
--- a/paddle/operators/matmul_op.cu.cc
+++ b/paddle/operators/matmul_op.cu.cc
@@ -15,7 +15,8 @@
 #include "paddle/operators/matmul_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(matmul,
-                       ops::MatMulKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    matmul_grad, ops::MatMulGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
index 1e4aa48b70..de9da487b3 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -27,7 +27,7 @@ using DDim = framework::DDim;
 using framework::make_ddim;
 using framework::vectorize;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MatMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -38,8 +38,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     bool transpose_x = context.Attr<bool>("transpose_X");
     bool transpose_y = context.Attr<bool>("transpose_Y");
 
-    math::MatMulFunctor<Place, T>()(context.device_context(), x, transpose_x, y,
-                                    transpose_y, T(1), out, T(0));
+    math::MatMulFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), x, transpose_x, y,
+        transpose_y, T(1), out, T(0));
   }
 };
 
@@ -68,17 +69,16 @@ Tensor CombineBatchAndM(const Tensor& input) {
 // Reshape a rank-3 tensor from P x M x N to M x (P * N).
 // (Warning: This requires transposing data and writes into new memory.)
 // Identity op if the tensor is not of rank 3.
-template <typename Place, typename T>
-Tensor CombineBatchAndN(const framework::ExecutionContext& context,
-                        const Tensor& input) {
+template <typename DeviceContext, typename T>
+Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) {
   Tensor output;
   auto in_dims = input.dims();
   if (in_dims.size() == 3) {
     output.Resize({in_dims[1], in_dims[0], in_dims[2]});
     output.mutable_data<T>(context.GetPlace());
     std::vector<int> axis = {1, 0, 2};
-    math::Transpose<Place, T, 3> trans;
-    trans(context.device_context(), input, &output, axis);
+    math::Transpose<DeviceContext, T, 3> trans;
+    trans(context, input, &output, axis);
     std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
     output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
   } else {
@@ -112,7 +112,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
 //
 // To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
 // to X: (P * M) x K, dOut: (P * M) x N.
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MatMulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -178,24 +178,23 @@ class MatMulGradKernel : public framework::OpKernel<T> {
     Tensor Y = Reshape<T>(y, make_ddim(y_dims));
     Tensor dOut = Reshape<T>(dout, make_ddim(dout_dims));
 
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (dx) {
       dx->mutable_data<T>(context.GetPlace());
       const Tensor& dOut_for_dX =
           (x_dims.size() == 2 && y_dims.size() == 3)
-              ? CombineBatchAndN<Place, T>(context, dOut)
+              ? CombineBatchAndN<DeviceContext, T>(dev_ctx, dOut)
               : dOut;
       if (x_dims.size() == 2 && y_dims.size() == 3) {
         Y = transpose_y ? CombineBatchAndM<T>(Y)
-                        : CombineBatchAndN<Place, T>(context, Y);
+                        : CombineBatchAndN<DeviceContext, T>(dev_ctx, Y);
       }
       if (transpose_x) {
-        math::MatMulFunctor<Place, T>()(context.device_context(), Y,
-                                        transpose_y, dOut_for_dX, transpose_x,
-                                        T(1), dx, T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0));
       } else {
-        math::MatMulFunctor<Place, T>()(context.device_context(), dOut_for_dX,
-                                        transpose_x, Y, !transpose_y, T(1), dx,
-                                        T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0));
       }
     }
 
@@ -205,18 +204,16 @@ class MatMulGradKernel : public framework::OpKernel<T> {
                                       ? CombineBatchAndM<T>(dOut)
                                       : dOut;
       if (y_dims.size() == 2 && x_dims.size() == 3) {
-        X = transpose_x ? CombineBatchAndN<Place, T>(context, X)
+        X = transpose_x ? CombineBatchAndN<DeviceContext, T>(dev_ctx, X)
                         : CombineBatchAndM<T>(X);
         dOut = CombineBatchAndM<T>(dOut);
       }
       if (transpose_y) {
-        math::MatMulFunctor<Place, T>()(context.device_context(), dOut_for_dY,
-                                        transpose_y, X, transpose_x, T(1), dy,
-                                        T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0));
       } else {
-        math::MatMulFunctor<Place, T>()(context.device_context(), X,
-                                        !transpose_x, dOut_for_dY, transpose_y,
-                                        T(1), dy, T(0));
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0));
       }
     }
   }
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
index 44bf402e95..011616e615 100644
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -101,7 +101,8 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
             ops::MaxOutOpGrad);
-REGISTER_OP_CPU_KERNEL(maxout,
-                       ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
+    maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad,
+    ops::MaxOutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc
index decd43913d..2904f0ff96 100644
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
@@ -15,9 +15,10 @@
 #include "paddle/operators/maxout_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(maxout,
-                       ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
-                       ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
-    maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
-    ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaxOutKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    maxout_grad,
+    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h
index 44a0d073dd..e8b12552b9 100644
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
@@ -23,7 +23,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MaxOutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -31,12 +31,13 @@ class MaxOutKernel : public framework::OpKernel<T> {
     Tensor* out = context.Output<Tensor>("Out");
     int groups = context.template Attr<int>("groups");
 
-    math::MaxOutFunctor<Place, T> maxout_forward;
-    maxout_forward(context.device_context(), *in_x, out, groups);
+    math::MaxOutFunctor<DeviceContext, T> maxout_forward;
+    maxout_forward(context.template device_context<DeviceContext>(), *in_x, out,
+                   groups);
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MaxOutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -46,14 +47,13 @@ class MaxOutGradKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
     int groups = context.template Attr<int>("groups");
-    auto& device_ctx = context.device_context();
-    math::SetConstant<Place, T> zero;
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
     if (in_x_grad) {
       in_x_grad->mutable_data<T>(context.GetPlace());
       zero(device_ctx, in_x_grad, static_cast<T>(0.0));
-      math::MaxOutGradFunctor<Place, T> maxout_backward;
-      maxout_backward(context.device_context(), *in_x, in_x_grad, *out,
-                      *out_grad, groups);
+      math::MaxOutGradFunctor<DeviceContext, T> maxout_backward;
+      maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups);
     }
   }
 };
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index dcc5b4286f..8932d700c2 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -76,8 +76,9 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
 REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
-REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<paddle::platform::CPUPlace, float>,
-                       ops::MeanKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(mean_grad,
-                       ops::MeanGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::MeanGradKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu
index ca089938c0..93062bf540 100644
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -17,8 +17,9 @@
 #include "paddle/operators/mean_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<paddle::platform::GPUPlace, float>,
-                       ops::MeanKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(mean_grad,
-                       ops::MeanGradKernel<paddle::platform::GPUPlace, float>,
-                       ops::MeanGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index c99286a5b9..351b345959 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MeanKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -38,13 +38,14 @@ class MeanKernel : public framework::OpKernel<T> {
 
     auto X = EigenVector<T>::Flatten(*input);
     auto y = EigenScalar<T>::From(*output);
-    auto& place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     y.device(place) = X.mean();
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MeanGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -56,7 +57,8 @@ class MeanGradKernel : public framework::OpKernel<T> {
     T ig_size = static_cast<T>(IG->numel());
     Eigen::DSizes<int, 1> bcast(ig_size);
 
-    EigenVector<T>::Flatten(*IG).device(context.GetEigenDevice<Place>()) =
+    EigenVector<T>::Flatten(*IG).device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
         (EigenVector<T>::From(*OG) / ig_size).broadcast(bcast);
   }
 };
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index 4684c20208..27f0c8de20 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -102,5 +102,5 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
-REGISTER_OP_CPU_KERNEL(minus,
-                       ops::MinusKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    minus, ops::MinusKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu
index a8375cc630..3b202ea92e 100644
--- a/paddle/operators/minus_op.cu
+++ b/paddle/operators/minus_op.cu
@@ -14,5 +14,6 @@
 
 #include "paddle/operators/minus_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    minus, paddle::operators::MinusKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    minus,
+    paddle::operators::MinusKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h
index bd9a2790aa..78e1e1be6d 100644
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MinusKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -28,7 +28,8 @@ class MinusKernel : public framework::OpKernel<T> {
     auto* out_tensor = context.Output<framework::Tensor>("Out");
 
     out_tensor->mutable_data<T>(context.GetPlace());
-    auto& dev = context.GetEigenDevice<Place>();
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
     framework::EigenVector<T>::Flatten(*out_tensor).device(dev) =
         framework::EigenVector<T>::Flatten(*left_tensor) -
         framework::EigenVector<T>::Flatten(*right_tensor);
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 28528848af..f0a42491bf 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -115,6 +115,6 @@ REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
 
 REGISTER_OP_CPU_KERNEL(
     modified_huber_loss,
-    ops::ModifiedHuberLossKernel<paddle::platform::CPUPlace, float>);
+    ops::ModifiedHuberLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad,
                        ops::ModifiedHuberLossGradCPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu
index 8854e166cd..40a8447da4 100644
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -71,8 +71,8 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     modified_huber_loss,
-    ops::ModifiedHuberLossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(modified_huber_loss_grad,
-                       ops::ModifiedHuberLossGradGPUKernel<float>);
+    ops::ModifiedHuberLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad,
+                        ops::ModifiedHuberLossGradGPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
index aba75efad9..157ae0682e 100644
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -46,7 +46,7 @@ struct ModifiedHuberLossForward {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ModifiedHuberLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -57,7 +57,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel<T> {
 
     out0->mutable_data<T>(context.GetPlace());
     out1->mutable_data<T>(context.GetPlace());
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto x = EigenVector<T>::Flatten(*in0);
     auto y = EigenVector<T>::Flatten(*in1);
diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu
index be0c8ea071..00f1253465 100644
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
@@ -74,5 +74,5 @@ class MomentumOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>,
-                       ops::MomentumOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>,
+                        ops::MomentumOpCUDAKernel<double>);
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 3c39ae10dc..bc4a5fdf0b 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -149,6 +149,7 @@ REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
                   ops::MulOpShapeInference,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
-REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mul_grad,
-                       ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc
index 66dc3d6d10..6095de58d0 100644
--- a/paddle/operators/mul_op.cu.cc
+++ b/paddle/operators/mul_op.cu.cc
@@ -15,6 +15,7 @@
 #include "paddle/operators/mul_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(mul_grad,
-                       ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 0eb9df41e9..1b467dca83 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -23,7 +23,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -46,15 +46,16 @@ class MulKernel : public framework::OpKernel<T> {
     if (z_dim.size() != 2) {
       z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
     }
-    math::matmul<Place, T>(context.device_context(), x_matrix, false, y_matrix,
-                           false, 1, z, 0);
+    math::matmul<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), x_matrix, false,
+        y_matrix, false, 1, z, 0);
     if (z_dim.size() != 2) {
       z->Resize(z_dim);
     }
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -77,6 +78,7 @@ class MulGradKernel : public framework::OpKernel<T> {
 
     Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     if (dx) {
       dx->mutable_data<T>(ctx.GetPlace());
       Tensor dx_matrix = dx->dims().size() > 2
@@ -84,8 +86,8 @@ class MulGradKernel : public framework::OpKernel<T> {
                              : *dx;
 
       // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      math::matmul<Place, T>(ctx.device_context(), dout_mat, false, y_matrix,
-                             true, 1, &dx_matrix, 0);
+      math::matmul<DeviceContext, T>(dev_ctx, dout_mat, false, y_matrix, true,
+                                     1, &dx_matrix, 0);
     }
     if (dy) {
       dy->mutable_data<T>(ctx.GetPlace());
@@ -93,8 +95,8 @@ class MulGradKernel : public framework::OpKernel<T> {
                              ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
                              : *dy;
       // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      math::matmul<Place, T>(ctx.device_context(), x_matrix, true, dout_mat,
-                             false, 1, &dy_matrix, 0);
+      math::matmul<DeviceContext, T>(dev_ctx, x_matrix, true, dout_mat, false,
+                                     1, &dy_matrix, 0);
     }
   }
 };
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 8e7f544e0d..b1ee8051c4 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -119,7 +119,8 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<false>);
 REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
 REGISTER_OP_CPU_KERNEL(
-    multiplex, ops::MultiplexCPUKernel<paddle::platform::CPUPlace, float>);
+    multiplex,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     multiplex_grad,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUPlace, float>);
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 10dff8d021..47986e9ff8 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
-    Place place = boost::get<Place>(ctx.GetPlace());
+    platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       int32_t k = index[i];
       PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
@@ -60,7 +60,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
         auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
-        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+        t.device(*ctx.template device_context<Place>().eigen_device()) =
+            t.constant(static_cast<T>(0));
       }
     }
 
@@ -72,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto* index = index_t_cpu.data<int32_t>();
 
     auto stream = ctx.cuda_device_context().stream();
-    Place place = boost::get<Place>(ctx.GetPlace());
+    platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       size_t k = static_cast<size_t>(index[i]);
       if (d_ins[k]) {
@@ -87,8 +88,9 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
-    multiplex, ops::MultiplexGPUKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    multiplex,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     multiplex_grad,
-    ops::MultiplexGradGPUKernel<paddle::platform::GPUPlace, float>);
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
index ab3cafaa32..3443151161 100644
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
@@ -22,7 +22,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MultiplexCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -35,7 +35,7 @@ class MultiplexCPUKernel : public framework::OpKernel<T> {
     auto rows = ins[0]->dims()[0];
     auto cols = ins[0]->numel() / rows;
     auto index = ids->data<int32_t>();
-    Place place = boost::get<Place>(ctx.GetPlace());
+    platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       int32_t k = index[i];
       PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
@@ -47,7 +47,7 @@ class MultiplexCPUKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class MultiplexGradCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -60,14 +60,15 @@ class MultiplexGradCPUKernel : public framework::OpKernel<T> {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
         auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
-        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+        t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
+            t.constant(static_cast<T>(0));
       }
     }
 
     auto rows = ins[0]->dims()[0];
     auto cols = ins[0]->numel() / rows;
     auto* index = ids->data<int32_t>();
-    Place place = boost::get<Place>(ctx.GetPlace());
+    platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       size_t k = static_cast<size_t>(index[i]);
       if (d_ins[k]) {
diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc
index 4f0a2a79ed..6ca6db7253 100644
--- a/paddle/operators/nccl_op.cu.cc
+++ b/paddle/operators/nccl_op.cu.cc
@@ -204,6 +204,6 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
-REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
-REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
+REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
+REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
+REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index bb7ae20286..d747cc0cf5 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -33,9 +33,9 @@
 #include "paddle/platform/place.h"
 
 USE_NO_KERNEL_OP(ncclInit);
-USE_GPU_ONLY_OP(ncclAllReduce);
-USE_GPU_ONLY_OP(ncclReduce);
-USE_GPU_ONLY_OP(ncclBcast);
+USE_CUDA_ONLY_OP(ncclAllReduce);
+USE_CUDA_ONLY_OP(ncclReduce);
+USE_CUDA_ONLY_OP(ncclBcast);
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc
index 952da10434..5ad1610fde 100644
--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
@@ -67,7 +67,7 @@ class NCEOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        ctx.device_context());
+        ctx.GetPlace());
   }
 };
 
@@ -170,7 +170,7 @@ class NCEOpGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-        ctx.device_context());
+        ctx.GetPlace());
   }
 };
 
diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h
index ea92a797fe..6636dad060 100644
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 void PrepareSamples(const framework::ExecutionContext& context) {
   auto label = context.Input<Tensor>("Label");
   const int64_t* label_data = label->data<int64_t>();
@@ -49,7 +49,7 @@ void PrepareSamples(const framework::ExecutionContext& context) {
 
   int num_label = label_dims.size() == 2 ? label_dims[1] : 1;
   int index = 0;
-  for (size_t i = 0; i < label_dims[0]; ++i) {
+  for (int64_t i = 0; i < label_dims[0]; ++i) {
     int j = 0;
     for (; j < num_label; ++j) {
       sample_labels_data[index++] = label_data[i * num_label + j];
@@ -67,11 +67,11 @@ void PrepareSamples(const framework::ExecutionContext& context) {
   }
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class NCEKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PrepareSamples<Place, T>(context);
+    PrepareSamples<DeviceContext, T>(context);
     auto sample_labels = context.Output<Tensor>("SampleLabels");
     const int64_t* sample_labels_data = sample_labels->data<int64_t>();
     auto sample_out = context.Output<Tensor>("SampleLogits");
@@ -86,7 +86,7 @@ class NCEKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
     int num_neg_samples = context.Attr<int>("num_neg_samples");
     int num_total_classes = context.Attr<int>("num_total_classes");
-    int num_true_class = 1;
+    int64_t num_true_class = 1;
     if (label != nullptr) {
       num_true_class = label->dims()[1];
     }
@@ -95,18 +95,18 @@ class NCEKernel : public framework::OpKernel<T> {
     auto bias = context.Input<Tensor>("Bias");
     if (bias != nullptr) {
       const T* bias_data = bias->data<T>();
-      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         sample_out_data[i] = bias_data[sample_labels_data[i]];
       }
     } else {
-      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         sample_out_data[i] = 0;
       }
     }
     // forward mul
     auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
     auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
-    for (size_t i = 0; i < sample_labels->numel(); ++i) {
+    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
       Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
           (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
            weight_mat.chip(sample_labels_data[i], 0))
@@ -115,8 +115,8 @@ class NCEKernel : public framework::OpKernel<T> {
       sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
     }
     // forward cost
-    for (size_t i = 0; i < sample_labels->dims()[0]; ++i) {
-      size_t j = 0;
+    for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
+      int64_t j = 0;
       out_data[i] = 0;
       T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
       // for true classes
@@ -135,7 +135,7 @@ class NCEKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class NCEGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -162,7 +162,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
     T* sample_grad_data =
         sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
     // backward cost
-    for (size_t i = 0; i < sample_labels->numel(); ++i) {
+    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
       T o = sample_out_data[i];
       T w = sample_weight == nullptr
                 ? 1
@@ -177,7 +177,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
     if (d_bias != nullptr) {
       T* d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
       std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
-      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
       }
     }
@@ -188,7 +188,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       std::fill(d_w_data, d_w_data + d_w->numel(), 0.0);
       auto d_w_matrix = EigenMatrix<T>::From(*d_w);
       auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
-      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         d_w_matrix.chip(sample_labels_data[i], 0) +=
             x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
             sample_grad_data[i];
@@ -200,7 +200,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       d_x->mutable_data<T>(context.GetPlace());
       auto d_x_matrix = EigenMatrix<T>::From(*d_x);
       auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
-      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
             w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
       }
diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h
index ebeb262d96..8935751f15 100644
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -38,7 +38,10 @@ namespace operators {
 class NetOp : public framework::OperatorBase {
  public:
   static const char kAll[];
-  NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {}
+  NetOp()
+      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
+                                framework::VariableNameMap{},
+                                framework::AttributeMap{}) {}
 
   NetOp(const std::string& type, const framework::VariableNameMap& inputs,
         const framework::VariableNameMap& outputs,
diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc
index 63bebd5b44..22fba9568d 100644
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -38,10 +38,10 @@ TEST(OpKernel, all) {
 
   net->AppendOp(std::unique_ptr<TestOp>(
       new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                 {{"Out", {"y"}}}, {})));
+                 {{"Out", {"y"}}}, framework::AttributeMap{})));
   net->AppendOp(std::unique_ptr<TestOp>(
       new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-                 {{"Out", {"z"}}}, {})));
+                 {{"Out", {"z"}}}, framework::AttributeMap{})));
 
   net->CompleteAddOp();
   AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
@@ -58,7 +58,7 @@ TEST(NetOp, insert_op) {
   NetOp net;
   auto op1 = std::unique_ptr<framework::NOP>(
       new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                         {{"Out", {"y"}}}, {}));
+                         {{"Out", {"y"}}}, framework::AttributeMap{}));
   net.AppendOp(*op1);
   net.InsertOp(0, *op1);
   ASSERT_EQ(2UL, net.ops_.size());
@@ -68,10 +68,12 @@ TEST(NetOp, insert_op) {
 
 TEST(NetOp, Clone) {
   NetOp net;
-  net.AppendOp(
-      std::unique_ptr<framework::NOP>(new framework::NOP{"empty", {}, {}, {}}));
-  net.AppendOp(std::unique_ptr<framework::NOP>(
-      new framework::NOP{"empty2", {}, {}, {}}));
+  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
+      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
+      framework::AttributeMap{}}));
+  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
+      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
+      framework::AttributeMap{}}));
   net.CompleteAddOp(true);
   auto new_net_op = net.Clone();
   ASSERT_NE(new_net_op, nullptr);
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index adb75df6ef..936dde22c3 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -134,6 +134,7 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
 REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
-REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(pad_grad,
-                       ops::PadGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu
index 555a7dba23..c309fb625c 100644
--- a/paddle/operators/pad_op.cu
+++ b/paddle/operators/pad_op.cu
@@ -16,6 +16,7 @@
 #include "paddle/operators/pad_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(pad, ops::PadKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(pad_grad,
-                       ops::PadGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    pad, ops::PadKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h
index 9534dbf545..1b95942af3 100644
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
@@ -26,7 +26,7 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 
-template <typename Place, typename T, size_t D>
+template <typename DeviceContext, typename T, size_t D>
 void PadFunction(const framework::ExecutionContext& context) {
   auto pads = context.Attr<std::vector<int>>("paddings");
   Eigen::array<std::pair<int, int>, D> paddings;
@@ -42,33 +42,34 @@ void PadFunction(const framework::ExecutionContext& context) {
 
   auto x_tensor = EigenTensor<T, D>::From(*x);
   auto out_tensor = EigenTensor<T, D>::From(*out);
-  auto place = context.GetEigenDevice<Place>();
+  auto& place =
+      *context.template device_context<DeviceContext>().eigen_device();
   out_tensor.device(place) = x_tensor.pad(paddings, pad_value);
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PadKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     int rank = context.Input<Tensor>("X")->dims().size();
     switch (rank) {
       case 1:
-        PadFunction<Place, T, 1>(context);
+        PadFunction<DeviceContext, T, 1>(context);
         break;
       case 2:
-        PadFunction<Place, T, 2>(context);
+        PadFunction<DeviceContext, T, 2>(context);
         break;
       case 3:
-        PadFunction<Place, T, 3>(context);
+        PadFunction<DeviceContext, T, 3>(context);
         break;
       case 4:
-        PadFunction<Place, T, 4>(context);
+        PadFunction<DeviceContext, T, 4>(context);
         break;
       case 5:
-        PadFunction<Place, T, 5>(context);
+        PadFunction<DeviceContext, T, 5>(context);
         break;
       case 6:
-        PadFunction<Place, T, 6>(context);
+        PadFunction<DeviceContext, T, 6>(context);
         break;
       default:
         PADDLE_THROW(
@@ -77,7 +78,7 @@ class PadKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T, size_t D>
+template <typename DeviceContext, typename T, size_t D>
 void PadGradFunction(const framework::ExecutionContext& context) {
   auto pads = context.Attr<std::vector<int>>("paddings");
   Eigen::array<std::pair<int, int>, D> paddings;
@@ -91,12 +92,13 @@ void PadGradFunction(const framework::ExecutionContext& context) {
     d_x->mutable_data<T>(context.GetPlace());
     auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
     auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0);
   }
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PadGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -104,22 +106,22 @@ class PadGradKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
     switch (rank) {
       case 1:
-        PadGradFunction<Place, T, 1>(context);
+        PadGradFunction<DeviceContext, T, 1>(context);
         break;
       case 2:
-        PadGradFunction<Place, T, 2>(context);
+        PadGradFunction<DeviceContext, T, 2>(context);
         break;
       case 3:
-        PadGradFunction<Place, T, 3>(context);
+        PadGradFunction<DeviceContext, T, 3>(context);
         break;
       case 4:
-        PadGradFunction<Place, T, 4>(context);
+        PadGradFunction<DeviceContext, T, 4>(context);
         break;
       case 5:
-        PadGradFunction<Place, T, 5>(context);
+        PadGradFunction<DeviceContext, T, 5>(context);
         break;
       case 6:
-        PadGradFunction<Place, T, 6>(context);
+        PadGradFunction<DeviceContext, T, 6>(context);
         break;
       default:
         PADDLE_THROW(
diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc
index be9fcc5661..77407f5cdf 100644
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
@@ -19,19 +19,21 @@ namespace ops = paddle::operators;
 REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad,
             ops::PoolOpGrad);
 
-REGISTER_OP_CPU_KERNEL(pool2d_cudnn,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
+REGISTER_OP_CPU_KERNEL(
+    pool2d_cudnn, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    pool2d_cudnn_grad,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
 
 REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad,
             ops::PoolOpGrad);
 
-REGISTER_OP_CPU_KERNEL(pool3d_cudnn,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
+REGISTER_OP_CPU_KERNEL(
+    pool3d_cudnn, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    pool3d_cudnn_grad,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc
index 66dd194ccd..fc2b37bd0f 100644
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -162,12 +162,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>,
-                       ops::PoolCudnnOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
-                       ops::PoolCudnnGradOpKernel<double>);
-
-REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel<float>,
-                       ops::PoolCudnnOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
-                       ops::PoolCudnnGradOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>,
+                        ops::PoolCudnnOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
+                        ops::PoolCudnnGradOpKernel<double>);
+
+REGISTER_OP_CUDA_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel<float>,
+                        ops::PoolCudnnOpKernel<double>);
+REGISTER_OP_CUDA_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
+                        ops::PoolCudnnGradOpKernel<double>);
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index e26ffd86e5..45fa20280c 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -216,19 +216,19 @@ namespace ops = paddle::operators;
 REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
             ops::PoolOpGrad);
 
-REGISTER_OP_CPU_KERNEL(pool2d,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(pool2d_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
+REGISTER_OP_CPU_KERNEL(
+    pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
 
 REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
             ops::PoolOpGrad);
 
-REGISTER_OP_CPU_KERNEL(pool3d,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(pool3d_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    pool3d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc
index 1010cb7622..39a9dfbf79 100644
--- a/paddle/operators/pool_op.cu.cc
+++ b/paddle/operators/pool_op.cu.cc
@@ -16,16 +16,18 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(pool2d,
-                       ops::PoolKernel<paddle::platform::GPUPlace, float>,
-                       ops::PoolKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(pool2d_grad,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    pool2d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    pool2d_grad,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
 
-REGISTER_OP_GPU_KERNEL(pool3d,
-                       ops::PoolKernel<paddle::platform::GPUPlace, float>,
-                       ops::PoolKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(pool3d_grad,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    pool3d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    pool3d_grad,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
index 63492a89e8..ab85d587a3 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -50,7 +50,7 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
                 framework::OpAttrChecker* op_checker);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -67,41 +67,41 @@ class PoolKernel : public framework::OpKernel<T> {
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
       }
     }
-
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     switch (ksize.size()) {
       case 2: {
         if (pooling_type == "max") {
           paddle::operators::math::Pool2dFunctor<
-              Place, paddle::operators::math::MaxPool<T>, T>
+              DeviceContext, paddle::operators::math::MaxPool<T>, T>
               pool2d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, ksize, strides,
-                         paddings, pool_process, out);
+          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
 
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool2dFunctor<
-              Place, paddle::operators::math::AvgPool<T>, T>
+              DeviceContext, paddle::operators::math::AvgPool<T>, T>
               pool2d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, ksize, strides,
-                         paddings, pool_process, out);
+          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
         }
       } break;
       case 3: {
         if (pooling_type == "max") {
           paddle::operators::math::Pool3dFunctor<
-              Place, paddle::operators::math::MaxPool<T>, T>
+              DeviceContext, paddle::operators::math::MaxPool<T>, T>
               pool3d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, ksize, strides,
-                         paddings, pool_process, out);
+          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool3dFunctor<
-              Place, paddle::operators::math::AvgPool<T>, T>
+              DeviceContext, paddle::operators::math::AvgPool<T>, T>
               pool3d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, ksize, strides,
-                         paddings, pool_process, out);
+          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
         }
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@@ -109,7 +109,7 @@ class PoolKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -130,42 +130,43 @@ class PoolGradKernel : public framework::OpKernel<T> {
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
       }
     }
-
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (in_x_grad) {
       in_x_grad->mutable_data<T>(context.GetPlace());
       auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
-      temp.device(context.GetEigenDevice<Place>()) =
+      temp.device(
+          *context.template device_context<DeviceContext>().eigen_device()) =
           temp.constant(static_cast<T>(0));
 
       switch (ksize.size()) {
         case 2: {
           if (pooling_type == "max") {
-            paddle::operators::math::MaxPool2dGradFunctor<Place, T>
+            paddle::operators::math::MaxPool2dGradFunctor<DeviceContext, T>
                 pool2d_backward;
-            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
-                            ksize, strides, paddings, in_x_grad);
+            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, in_x_grad);
           } else if (pooling_type == "avg") {
             paddle::operators::math::Pool2dGradFunctor<
-                Place, paddle::operators::math::AvgPoolGrad<T>, T>
+                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
                 pool2d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
-                            ksize, strides, paddings, pool_process, in_x_grad);
+            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, pool_process, in_x_grad);
           }
         } break;
         case 3: {
           if (pooling_type == "max") {
-            paddle::operators::math::MaxPool3dGradFunctor<Place, T>
+            paddle::operators::math::MaxPool3dGradFunctor<DeviceContext, T>
                 pool3d_backward;
-            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
-                            ksize, strides, paddings, in_x_grad);
+            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, in_x_grad);
           } else if (pooling_type == "avg") {
             paddle::operators::math::Pool3dGradFunctor<
-                Place, paddle::operators::math::AvgPoolGrad<T>, T>
+                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
                 pool3d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
-                            ksize, strides, paddings, pool_process, in_x_grad);
+            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, pool_process, in_x_grad);
           }
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index b9c42a6912..1a2383f8b8 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -266,12 +266,15 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
 
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
+                                int>);
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
+                                    int>)
 
 REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
             ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
@@ -279,9 +282,12 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
 
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
+                                int>);
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
+                                    int>)
diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc
index 335064a7ee..4c9804da63 100644
--- a/paddle/operators/pool_with_index_op.cu.cc
+++ b/paddle/operators/pool_with_index_op.cu.cc
@@ -16,20 +16,28 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
-REGISTER_OP_GPU_KERNEL(
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
+                                int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
+                                int>);
+REGISTER_OP_CUDA_KERNEL(
     max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
+                                    int>)
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
-REGISTER_OP_GPU_KERNEL(
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
+                                int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
+                                int>);
+REGISTER_OP_CUDA_KERNEL(
     max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
+                                    int>)
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h
index 40766c7e82..4f4087d1dd 100644
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -24,7 +24,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T1, typename T2>
+template <typename DeviceContext, typename T1, typename T2>
 class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -35,6 +35,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -44,23 +46,23 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
 
     switch (ksize.size()) {
       case 2: {
-        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T1, T2>
+        paddle::operators::math::MaxPool2dWithIndexFunctor<DeviceContext, T1,
+                                                           T2>
             pool2d_forward;
-        pool2d_forward(context.device_context(), *in_x, ksize, strides,
-                       paddings, out, mask);
+        pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
       } break;
       case 3: {
-        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T1, T2>
+        paddle::operators::math::MaxPool3dWithIndexFunctor<DeviceContext, T1,
+                                                           T2>
             pool3d_forward;
-        pool3d_forward(context.device_context(), *in_x, ksize, strides,
-                       paddings, out, mask);
+        pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
     }
   }
 };
 
-template <typename Place, typename T1, typename T2>
+template <typename DeviceContext, typename T1, typename T2>
 class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -81,18 +83,20 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
 
     if (in_x_grad) {
       in_x_grad->mutable_data<T1>(context.GetPlace());
-      auto& device_ctx = context.device_context();
+      auto& device_ctx = context.template device_context<DeviceContext>();
       math::set_constant(device_ctx, in_x_grad, 0);
 
       switch (ksize.size()) {
         case 2: {
-          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T1, T2>
+          paddle::operators::math::MaxPool2dWithIndexGradFunctor<DeviceContext,
+                                                                 T1, T2>
               pool2d_backward;
           pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                           paddings, in_x_grad);
         } break;
         case 3: {
-          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T1, T2>
+          paddle::operators::math::MaxPool3dWithIndexGradFunctor<DeviceContext,
+                                                                 T1, T2>
               pool3d_backward;
           pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                           paddings, in_x_grad);
diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/operators/positive_negative_pair_op.h
index 2efd3777e0..977e59b7d2 100644
--- a/paddle/operators/positive_negative_pair_op.h
+++ b/paddle/operators/positive_negative_pair_op.h
@@ -22,7 +22,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PositiveNegativePairKernel : public framework::OpKernel<T> {
  public:
   struct PredictionResult {
diff --git a/paddle/operators/precision_recall_op.h b/paddle/operators/precision_recall_op.h
index 4a871ce674..c0d55405a3 100644
--- a/paddle/operators/precision_recall_op.h
+++ b/paddle/operators/precision_recall_op.h
@@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 enum StateVariable { TP = 0, FP, TN, FN };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PrecisionRecallKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index 055c471b45..317a2a4015 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -85,7 +85,8 @@ namespace ops = paddle::operators;
 
 REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
             ops::PReluGradOp);
-REGISTER_OP_CPU_KERNEL(prelu,
-                       ops::PReluKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(prelu_grad,
-                       ops::PReluGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    prelu_grad,
+    ops::PReluGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu
index 9e391dabae..12033dee0e 100644
--- a/paddle/operators/prelu_op.cu
+++ b/paddle/operators/prelu_op.cu
@@ -14,8 +14,9 @@
 
 #include "paddle/operators/prelu_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    prelu, paddle::operators::PReluKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    prelu_grad,
-    paddle::operators::PReluGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    prelu,
+    paddle::operators::PReluKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(prelu_grad,
+                        paddle::operators::PReluGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h
index 5ad31c2203..56f9a553ec 100644
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
@@ -39,7 +39,7 @@ class PReluFunctor {
   const T* alpha_;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PReluKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -54,9 +54,9 @@ class PReluKernel : public framework::OpKernel<T> {
 
     int numel = x->numel();
 
-    Transform<Place> trans;
-    trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr,
-          PReluFunctor<T>(alpha_ptr));
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_ptr,
+          x_ptr + numel, o_ptr, PReluFunctor<T>(alpha_ptr));
   }
 };
 
@@ -76,7 +76,7 @@ class PReluGradFunctor {
   const T* alpha_;
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class PReluGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -92,9 +92,9 @@ class PReluGradKernel : public framework::OpKernel<T> {
     const T* out_ptr = out->data<T>();
     int numel = dx->numel();
 
-    Transform<Place> trans;
-    trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr,
-          PReluGradFunctor<T>(alpha_ptr));
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), out_ptr,
+          out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor<T>(alpha_ptr));
 
     // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
   }
diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc
index 36e460103a..cc350f6d26 100644
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -114,4 +114,4 @@ REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp,
                              ops::ProximalAdagradOpMaker);
 REGISTER_OP_CPU_KERNEL(
     proximal_adagrad,
-    ops::ProximalAdagradOpKernel<paddle::platform::CPUPlace, float>);
+    ops::ProximalAdagradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/operators/proximal_adagrad_op.cu
index d0ae039518..42a178f94b 100644
--- a/paddle/operators/proximal_adagrad_op.cu
+++ b/paddle/operators/proximal_adagrad_op.cu
@@ -15,6 +15,6 @@ specific language governing permissions and limitations under the License. */
 #include "paddle/operators/proximal_adagrad_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     proximal_adagrad,
-    ops::ProximalAdagradOpKernel<paddle::platform::GPUPlace, float>);
+    ops::ProximalAdagradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/operators/proximal_adagrad_op.h
index 7a1560e8cb..523924d80e 100644
--- a/paddle/operators/proximal_adagrad_op.h
+++ b/paddle/operators/proximal_adagrad_op.h
@@ -24,7 +24,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ProximalAdagradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -45,20 +45,20 @@ class ProximalAdagradOpKernel : public framework::OpKernel<T> {
 
     auto p_out = EigenVector<T>::Flatten(*param_out);
     auto m_out = EigenVector<T>::Flatten(*moment_out);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
 
-    m_out.device(place) = m + g * g;
+    m_out.device(*place) = m + g * g;
     auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt();
     if (l1 > static_cast<T>(0)) {
-      p_out.device(place) =
+      p_out.device(*place) =
           prox_param.sign() *
           (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
                 .cwiseMax(static_cast<T>(0.0))) /
            (static_cast<T>(1.0) + (lr * l2).broadcast(grad_dsize)));
     } else {
-      p_out.device(place) =
+      p_out.device(*place) =
           prox_param / (static_cast<T>(1.0) + (lr * l2).broadcast(grad_dsize));
     }
   }
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc
index 5693d0ec9e..0b26beb3ac 100644
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
@@ -94,4 +94,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp,
                              ops::ProximalGDOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    proximal_gd, ops::ProximalGDOpKernel<paddle::platform::CPUPlace, float>);
+    proximal_gd,
+    ops::ProximalGDOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/operators/proximal_gd_op.cu
index 26f4ebaa0f..b7dd840d19 100644
--- a/paddle/operators/proximal_gd_op.cu
+++ b/paddle/operators/proximal_gd_op.cu
@@ -15,5 +15,6 @@ specific language governing permissions and limitations under the License. */
 #include "paddle/operators/proximal_gd_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    proximal_gd, ops::ProximalGDOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    proximal_gd,
+    ops::ProximalGDOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h
index bebda02041..64648b3cca 100644
--- a/paddle/operators/proximal_gd_op.h
+++ b/paddle/operators/proximal_gd_op.h
@@ -24,7 +24,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ProximalGDOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -42,7 +42,7 @@ class ProximalGDOpKernel : public framework::OpKernel<T> {
     auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
 
     auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
 
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index 912f88f455..b80b175792 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -123,7 +123,8 @@ namespace ops = paddle::operators;
 
 REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad,
             ops::RankLossGradOp);
-REGISTER_OP_CPU_KERNEL(rank_loss,
-                       ops::RankLossKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    rank_loss_grad, ops::RankLossGradKernel<paddle::platform::CPUPlace, float>);
+    rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    rank_loss_grad,
+    ops::RankLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu
index 5382e3a629..5aee66443d 100644
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/operators/rank_loss_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    rank_loss,
-    paddle::operators::RankLossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    rank_loss_grad,
-    paddle::operators::RankLossGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(rank_loss,
+                        paddle::operators::RankLossKernel<
+                            paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(rank_loss_grad,
+                        paddle::operators::RankLossGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
index 703c77a0b2..ea24b61fd9 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class RankLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -35,13 +35,13 @@ class RankLossKernel : public framework::OpKernel<T> {
     auto left = framework::EigenVector<T>::Flatten(*left_t);
     auto right = framework::EigenVector<T>::Flatten(*right_t);
 
-    auto& dev = ctx.GetEigenDevice<Place>();
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
     out.device(dev) =
         (1. + (left - right).exp()).log() - label * (left - right);
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class RankLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -55,7 +55,7 @@ class RankLossGradKernel : public framework::OpKernel<T> {
     auto* left_t = ctx.Input<framework::Tensor>("Left");
     auto* right_t = ctx.Input<framework::Tensor>("Right");
 
-    auto& dev = ctx.GetEigenDevice<Place>();
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
     auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
     auto label = framework::EigenVector<T>::Flatten(*label_t);
     auto left = framework::EigenVector<T>::Flatten(*left_t);
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 8b60b9c912..232d926f7b 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -25,7 +25,7 @@ constexpr char kOutputs[] = "outputs";
 constexpr char kStepScopes[] = "step_scopes";
 constexpr char kExStates[] = "ex_states";
 constexpr char kStates[] = "states";
-constexpr char kStepBlock[] = "step_block";
+constexpr char kStepBlock[] = "sub_block";
 constexpr char kReverse[] = "reverse";
 constexpr char kIsTrain[] = "is_train";
 #define GRAD_SUFFIX "@GRAD"
@@ -408,7 +408,8 @@ class RecurrentGradOp : public RecurrentBase {
             attrs["value"] = 0.0f;
 
             auto zero_op = framework::OpRegistry::CreateOp(
-                "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
+                "fill_constant", framework::VariableNameMap{},
+                {{"Out", {pg_names[param_id]}}}, attrs);
             zero_op->Run(scope, dev_ctx);
           }
 
@@ -417,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase {
 
           auto sum_op = framework::OpRegistry::CreateOp(
               "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-              {{"Out", {pg_names[param_id]}}}, {});
+              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
           sum_op->Run(cur_scope, dev_ctx);
 
           cur_scope.Rename(new_inside_name, inside_grad_name);
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index c69e416e10..eed482c1b4 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -72,11 +72,13 @@ class RecvOp : public framework::OperatorBase {
     // FIXME(typhoonzero): do not copy
     framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
 
-    auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
-    auto *program = block->Program();
+    std::string program_str = Attr<std::string>("OptimizeProgram");
+    framework::ProgramDesc program_desc;
+    program_desc.ParseFromString(program_str);
+    framework::ProgramDescBind program(program_desc);
     framework::Executor executor(dev_ctx);
     // Run sub graph to get optimized tensor
-    executor.Run(*program, &recv_scope, block->ID(),
+    executor.Run(program, &recv_scope, 0, /*global_block*/
                  false /*create_local_scope*/);
 
     auto *out_var = recv_scope.FindVar("Out");
@@ -108,8 +110,8 @@ This operator will recv tensor from send_op
                          "IP address to listen on.")
         .SetDefault("127.0.0.1:6164")
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<framework::BlockDescBind *>("OptimizeBlock", "type BlockDescBind*",
-                                        "optimize network run in server");
+    AddAttr<std::string>("OptimizeProgram", "type string",
+                         "Serialized ProgramDesc string for recv to run.");
   }
 };
 
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 2589a54cfc..fedc2a5c37 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -37,18 +37,23 @@ class ReduceOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_LT(
         dim, x_rank,
         "The dim should be in the range [-rank(input), rank(input)).");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    auto dims_vector = vectorize(x_dims);
-    if (keep_dim || x_rank == 1) {
-      dims_vector[dim] = 1;
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    if (reduce_all) {
+      ctx->SetOutputDim("Out", {1});
     } else {
-      dims_vector.erase(dims_vector.begin() + dim);
-    }
-    auto out_dims = framework::make_ddim(dims_vector);
-    ctx->SetOutputDim("Out", out_dims);
-    if (dim != 0) {
-      // Only pass LoD when not reducing on the first dim.
-      ctx->ShareLoD("X", /*->*/ "Out");
+      bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+      auto dims_vector = vectorize(x_dims);
+      if (keep_dim || x_rank == 1) {
+        dims_vector[dim] = 1;
+      } else {
+        dims_vector.erase(dims_vector.begin() + dim);
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dim != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
     }
   }
 };
@@ -95,11 +100,16 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default false) "
                   "If true, retain the reduced dimension with length 1.")
         .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
     comment_ = R"DOC(
 {ReduceOp} Operator.
 
 This operator computes the {reduce} of input tensor along the given dimension. 
 The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
 
 )DOC";
     AddComment(comment_);
@@ -180,12 +190,13 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
 REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
             ops::ReduceGradOp);
 
-#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)     \
-  REGISTER_OP_CPU_KERNEL(                                                  \
-      reduce_type,                                                         \
-      ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::functor>); \
-  REGISTER_OP_CPU_KERNEL(reduce_type##_grad,                               \
-                         ops::ReduceGradKernel<paddle::platform::CPUPlace, \
-                                               float, ops::grad_functor>);
+#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
+  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           float, ops::functor>);              \
+  REGISTER_OP_CPU_KERNEL(                                                      \
+      reduce_type##_grad,                                                      \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
+                            ops::grad_functor>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu
index d306e1a240..a10ace5253 100644
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
@@ -17,12 +17,13 @@
 
 namespace ops = paddle::operators;
 
-#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)     \
-  REGISTER_OP_GPU_KERNEL(                                                  \
-      reduce_type,                                                         \
-      ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::functor>); \
-  REGISTER_OP_GPU_KERNEL(reduce_type##_grad,                               \
-                         ops::ReduceGradKernel<paddle::platform::GPUPlace, \
-                                               float, ops::grad_functor>);
+#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)    \
+  REGISTER_OP_CUDA_KERNEL(                                                \
+      reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
+                                     float, ops::functor>);               \
+  REGISTER_OP_CUDA_KERNEL(                                                \
+      reduce_type##_grad,                                                 \
+      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,   \
+                            ops::grad_functor>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h
index dd6547542d..7bd99cb1e6 100644
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
@@ -26,61 +26,63 @@ using DDim = framework::DDim;
 template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 struct SumFunctor {
-  template <typename Place, typename X, typename Y, typename Dim>
-  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
     y.device(place) = x.sum(dim);
   }
 };
 
 struct SumGradFunctor {
-  template <typename Place, typename X, typename Y, typename DX, typename DY,
-            typename Dim>
-  void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy,
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
                   const Dim& dim, int size) {
     dx.device(place) = dy.broadcast(dim);
   }
 };
 
 struct MeanFunctor {
-  template <typename Place, typename X, typename Y, typename Dim>
-  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
     y.device(place) = x.mean(dim);
   }
 };
 
 struct MeanGradFunctor {
-  template <typename Place, typename X, typename Y, typename DX, typename DY,
-            typename Dim>
-  void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy,
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
                   const Dim& dim, int size) {
     dx.device(place) = dy.broadcast(dim) / dx.constant(size);
   }
 };
 
 struct MaxFunctor {
-  template <typename Place, typename X, typename Y, typename Dim>
-  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
     y.device(place) = x.maximum(dim);
   }
 };
 
 struct MinFunctor {
-  template <typename Place, typename X, typename Y, typename Dim>
-  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
     y.device(place) = x.minimum(dim);
   }
 };
 
 struct MaxOrMinGradFunctor {
-  template <typename Place, typename X, typename Y, typename DX, typename DY,
-            typename Dim>
-  void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy,
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
                   const Dim& dim, int size) {
     auto equals = x == y.broadcast(dim);
     auto ones = dx.constant(1);
@@ -91,30 +93,45 @@ struct MaxOrMinGradFunctor {
   }
 };
 
-template <typename Place, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor>
 class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<Tensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        ReduceCompute<1>(context);
-        break;
-      case 2:
-        ReduceCompute<2>(context);
-        break;
-      case 3:
-        ReduceCompute<3>(context);
-        break;
-      case 4:
-        ReduceCompute<4>(context);
-        break;
-      case 5:
-        ReduceCompute<5>(context);
-        break;
-      case 6:
-        ReduceCompute<6>(context);
-        break;
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    if (reduce_all) {
+      // Flatten and reduce 1-D tensor
+      auto* input = context.Input<Tensor>("X");
+      auto* output = context.Output<Tensor>("Out");
+      output->mutable_data<T>(context.GetPlace());
+      auto x = EigenVector<T>::Flatten(*input);
+      auto out = EigenScalar<T>::From(*output);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto reduce_dim = Eigen::array<int, 1>({{0}});
+      Functor functor;
+      functor(place, x, out, reduce_dim);
+    } else {
+      int rank = context.Input<Tensor>("X")->dims().size();
+      switch (rank) {
+        case 1:
+          ReduceCompute<1>(context);
+          break;
+        case 2:
+          ReduceCompute<2>(context);
+          break;
+        case 3:
+          ReduceCompute<3>(context);
+          break;
+        case 4:
+          ReduceCompute<4>(context);
+          break;
+        case 5:
+          ReduceCompute<5>(context);
+          break;
+        case 6:
+          ReduceCompute<6>(context);
+          break;
+      }
     }
   }
 
@@ -139,7 +156,8 @@ class ReduceKernel : public framework::OpKernel<T> {
       dims = framework::make_ddim(dims_vector);
     }
 
-    auto& place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
 
     if (D == 1) {
@@ -152,30 +170,50 @@ class ReduceKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor>
 class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<Tensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        ReduceGradCompute<1>(context);
-        break;
-      case 2:
-        ReduceGradCompute<2>(context);
-        break;
-      case 3:
-        ReduceGradCompute<3>(context);
-        break;
-      case 4:
-        ReduceGradCompute<4>(context);
-        break;
-      case 5:
-        ReduceGradCompute<5>(context);
-        break;
-      case 6:
-        ReduceGradCompute<6>(context);
-        break;
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    if (reduce_all) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input1 = context.Input<Tensor>("Out");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      output->mutable_data<T>(context.GetPlace());
+      auto x = EigenVector<T>::Flatten(*input0);
+      auto x_reduce = EigenVector<T>::From(*input1);
+      auto x_reduce_grad = EigenVector<T>::From(*input2);
+      auto x_grad = EigenVector<T>::Flatten(*output);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto broadcast_dim =
+          Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
+      Functor functor;
+      functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+              broadcast_dim[0]);
+    } else {
+      int rank = context.Input<Tensor>("X")->dims().size();
+      switch (rank) {
+        case 1:
+          ReduceGradCompute<1>(context);
+          break;
+        case 2:
+          ReduceGradCompute<2>(context);
+          break;
+        case 3:
+          ReduceGradCompute<3>(context);
+          break;
+        case 4:
+          ReduceGradCompute<4>(context);
+          break;
+        case 5:
+          ReduceGradCompute<5>(context);
+          break;
+        case 6:
+          ReduceGradCompute<6>(context);
+          break;
+      }
     }
   }
 
@@ -201,7 +239,8 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     Eigen::array<int, D> broadcast_dim;
     for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
     broadcast_dim[dim] = input0->dims()[dim];
-    auto& place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
     functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
             broadcast_dim[dim]);
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index 39bf2118d6..d82d828747 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -34,21 +34,33 @@ class ReshapeOp : public framework::OperatorWithKernel {
     auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
     PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
     auto x_dims = ctx->GetInputDim("X");
-    // TODO(qiao) change batch_size
-    for (size_t i = 1; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] > 0,
-                     "Each dimension of Attr(shape) "
-                     "must be positive except the first one.");
-    }
-    if (shape[0] < 0) {
-      shape[0] = x_dims[0];
+
+    std::vector<size_t> neg_dims_idx;
+    // set some dimension to -1 if it is unknown
+    const int unknown_size = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
+                     "Each dimension of Attr(shape) must be positive or %d.",
+                     unknown_size);
+      if (shape[i] == unknown_size) {
+        neg_dims_idx.push_back(i);
+        PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
+                       "Only one dimension of Attr(shape) can be unknown.");
+      }
     }
-    // capacity check
+
     int64_t capacity =
         std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
     int64_t in_size = framework::product(x_dims);
-    PADDLE_ENFORCE_EQ(capacity, in_size,
-                      "The size of Input(X) mismatches with Attr(shape).");
+    if (neg_dims_idx.size() == 1) {
+      // dim infer
+      shape[neg_dims_idx[0]] = in_size / (-capacity);
+      // recalculate capacity
+      capacity = shape[neg_dims_idx[0]] * (-capacity);
+    }
+    // capacity check
+    PADDLE_ENFORCE(capacity == in_size,
+                   "The size of Input(X) mismatches with Attr(shape).");
     // resize output
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
@@ -84,10 +96,13 @@ Given a 2-D tensor X with 2 rows and 2 columns
     [[1, 2], [3, 4]]
 
 and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 1-D tensor:
+the tensor X into a 2-D tensor:
 
-    [1, 2, 3, 4]
+    [[1, 2, 3, 4]]
 
+One dimension in the target shape can be set -1, representing that its
+size is unknown. In this case, the real dimension will be infered from 
+the original shape of Input(X) and other dimensions in the target shape.
 )DOC");
   }
 };
diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu
index dca6c15007..b7329238c0 100644
--- a/paddle/operators/reshape_op.cu
+++ b/paddle/operators/reshape_op.cu
@@ -14,9 +14,9 @@
 
 #include "paddle/operators/reshape_op.h"
 
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     reshape,
     paddle::operators::ReshapeKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     reshape_grad,
     paddle::operators::ReshapeGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index 73fd1da642..92d8cbbb56 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ReshapeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ReshapeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc
index a9c45f639c..fc3f9b8988 100644
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
@@ -116,5 +116,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
-REGISTER_OP_CPU_KERNEL(rmsprop,
-                       ops::RmspropOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu
index 52634a5481..2a9fd6e104 100644
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/operators/rmsprop_op.cu
@@ -16,5 +16,5 @@
 #include "paddle/operators/rmsprop_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(rmsprop,
-                       ops::RmspropOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h
index 7bf2129010..16a561835d 100644
--- a/paddle/operators/rmsprop_op.h
+++ b/paddle/operators/rmsprop_op.h
@@ -24,7 +24,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class RmspropOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,7 +51,7 @@ class RmspropOpKernel : public framework::OpKernel<T> {
     auto p_out = EigenVector<T>::Flatten(*param_out);
     auto mom_out = EigenVector<T>::Flatten(*moment_out);
     auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
-    auto place = ctx.GetEigenDevice<Place>();
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> grad_dsize(grad->numel());
 
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc
index 2b5e66c96b..75fcea8401 100644
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -157,9 +157,10 @@ namespace ops = paddle::operators;
 REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
             ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
-    roi_pool, ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
+    roi_pool,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     roi_pool_grad,
-    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUPlace, float>,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu
index 9a4c8ca752..a874befe4d 100644
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
@@ -177,7 +177,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     if (x_grad) {
       x_grad->mutable_data<T>(ctx.GetPlace());
       math::SetConstant<Place, T> set_zero;
-      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+      set_zero(ctx.cuda_device_context(), x_grad, static_cast<T>(0));
 
       int output_grad_size = out_grad->numel();
       int blocks = NumBlocks(output_grad_size);
@@ -199,10 +199,11 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    roi_pool, ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
-    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    roi_pool,
+    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
     roi_pool_grad,
-    ops::GPUROIPoolGradOpKernel<paddle::platform::GPUPlace, float>,
-    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
+    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h
index 3812c66c65..09a9d3d870 100644
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CPUROIPoolOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -126,7 +126,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -145,8 +145,9 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
       const T* out_grad_data = out_grad->data<T>();
       const int64_t* argmax_data = argmax->data<int64_t>();
       T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<Place, T> set_zero;
-      set_zero(ctx.device_context(), in_grad, static_cast<T>(0));
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(ctx.template device_context<DeviceContext>(), in_grad,
+               static_cast<T>(0));
 
       auto in_stride = framework::stride(in->dims());
       auto argmax_stride = framework::stride(argmax->dims());
diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc
new file mode 100644
index 0000000000..5203a5079c
--- /dev/null
+++ b/paddle/operators/row_conv_op.cc
@@ -0,0 +1,260 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/row_conv_op.h"
+#include "paddle/framework/eigen.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+class RowConvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of RowConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                   "Input(Filter) of RowConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of RowConvOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[1], filter_dims[1],
+        "The 2nd dimension of Input(X) and Input(Filter) should be same.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class RowConvGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                   "Input(Filter) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Gradient of output(Out) should not be null.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      auto x_dims = ctx->GetInputDim("X");
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+
+    auto filter_grad_name = framework::GradVarName("Filter");
+    if (ctx->HasOutput(filter_grad_name)) {
+      auto filter_dims = ctx->GetInputDim("Filter");
+      ctx->SetOutputDim(filter_grad_name, filter_dims);
+    }
+  }
+};
+
+class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RowConvOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor), the input(X) is a LodTensor, which supports "
+             "variable time-length input sequences. The underlying tensor "
+             "in this LoDTensor is a matrix with shape (T x N), where T "
+             "is the total time steps in this mini-batch and N is the input "
+             "data dimension.");
+    AddInput("Filter",
+             "(Tensor), the input(Filter) is a learnable parameter. It "
+             "is a 2-D tensor with shape (future_context x N), where, "
+             "future_context is the future context length and N is the data "
+             "dimension.");
+    AddOutput("Out",
+              "(LoDTensor), the output(Out) is a LodTensor, which supports "
+              "variable time-length input sequences. The underlying tensor "
+              "in this LodTensor is a matrix with shape T x N, i.e., the "
+              "same shape as X.");
+    AddComment(R"DOC(
+Row-convolution Operator.
+
+The row convolution is called lookahead convolution.  This operator was 
+introduced in the following paper for DeepSpeech2:
+http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf 
+
+The main motivation is that a bidirectional RNN, useful in DeepSpeech 
+like speech models, learns representation for a sequence by performing a 
+forward and a backward pass through the entire sequence. However, unlike 
+unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
+and low-latency setting. The lookahead convolution incorporates information 
+from future subsequences in a computationally efficient manner to improve 
+unidirectional recurrent neural networks. The row convolution operator is 
+different from the 1D sequence convolution, and is computed as follows:
+
+Given an input sequence $in$ of length $t$ and input dimension $d$, 
+and a filter ($W$) of size $context \times d$, 
+the output sequence is convolved as:
+
+$$
+out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :}
+$$
+
+)DOC");
+  }
+};
+
+template <typename T>
+class RowConvKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<LoDTensor>("X");
+    auto *filter = context.Input<Tensor>("Filter");
+    auto *out = context.Output<LoDTensor>("Out");
+
+    out->mutable_data<T>(context.GetPlace());
+
+    auto batch_indices = x->lod()[0];
+    auto input_dim = x->dims()[1];  // 'in' is of size T x N
+    size_t num_sequence = batch_indices.size() - 1;
+
+    auto future_context = filter->dims()[0];
+    auto weights = EigenMatrix<T>::From(*filter);
+
+    for (size_t i = 0; i < num_sequence; i++) {
+      int start = static_cast<int>(batch_indices[i]);
+      int end = static_cast<int>(batch_indices[i + 1]);
+      int current_timesteps = end - start;
+      Tensor cur_input_sequence =
+          x->Slice(start, end);  // Current input sequence
+      Tensor cur_output_sequence =
+          out->Slice(start, end);  // Current output sequence
+      auto cip_seq = EigenMatrix<T>::From(cur_input_sequence);
+      auto cot_seq = EigenMatrix<T>::From(cur_output_sequence);
+
+      for (int k = 0; k < current_timesteps;
+           k++) {  // For different time steps in the same sequence
+        for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+             w++) {
+          for (int d = 0; d < input_dim; d++) {
+            if (w == 0) {
+              cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d);
+            } else {
+              cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class RowConvGradKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<LoDTensor>("X");
+    auto *filter = context.Input<Tensor>("Filter");
+    auto *d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto *dx = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto *d_filter = context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    auto input_dim = x->dims()[1];  // 'x' is of size T x N
+    auto batch_indices = x->lod()[0];
+    size_t num_sequence = batch_indices.size() - 1;
+    auto future_context = filter->dims()[0];
+
+    if (d_filter) {
+      d_filter->mutable_data<T>(context.GetPlace());
+      auto dweights =
+          EigenMatrix<T>::From(*d_filter);  // Gradient of weight matrix
+      dweights.setZero();
+
+      for (size_t i = 0; i < num_sequence; i++) {  // For different sequences
+        int start = static_cast<int>(batch_indices[i]);
+        int end = static_cast<int>(batch_indices[i + 1]);
+
+        Tensor cur_input = x->Slice(start, end);  // Current input sequence
+        Tensor cur_doutput =
+            d_out->Slice(start, end);  // Current output grad sequence
+
+        auto cur_ip = EigenMatrix<T>::From(cur_input);
+        auto cur_dout = EigenMatrix<T>::From(cur_doutput);
+        int current_timesteps = end - start;
+
+        for (int k = 0; k < current_timesteps;
+             k++) {  // For different time steps in the same sequence
+          for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+               w++) {
+            // For dweights (Updating the gradient of weight matrix)
+            for (int d = 0; d < input_dim; d++) {
+              dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d);
+            }
+          }
+        }
+      }
+    }
+
+    if (dx) {
+      dx->mutable_data<T>(context.GetPlace());
+      auto weights = EigenMatrix<T>::From(*filter);
+      for (size_t i = 0; i < num_sequence; i++) {  // For different sequences
+        int start = static_cast<int>(batch_indices[i]);
+        int end = static_cast<int>(batch_indices[i + 1]);
+
+        Tensor cur_doutput =
+            d_out->Slice(start, end);  // Current output grad sequence
+        Tensor cur_dinput =
+            dx->Slice(start, end);  // Current input grad sequence
+
+        auto cur_dout = EigenMatrix<T>::From(cur_doutput);
+        auto cur_dip = EigenMatrix<T>::From(cur_dinput);
+        cur_dip.setZero();
+        int current_timesteps = end - start;
+
+        for (int k = 0; k < current_timesteps;
+             k++) {  // For different time steps in the same sequence
+          for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+               w++) {
+            // For dinput (Updating the gradient wrt input)
+            for (int d = 0; d < input_dim; d++) {
+              cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad,
+            ops::RowConvGradOp);
+REGISTER_OP_CPU_KERNEL(
+    row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    row_conv_grad,
+    ops::RowConvGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu
new file mode 100644
index 0000000000..56a98ff299
--- /dev/null
+++ b/paddle/operators/row_conv_op.cu
@@ -0,0 +1,410 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/row_conv_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using framework::Tensor;
+
+namespace {
+
+inline int DivUp(int x, int y) { return (x + y - 1) / y; }
+
+// Forward prop (shared memory version, for small future_context)
+template <typename T>
+__global__ void RowConvForwardSharedMemory(const T *in, const T *wt,
+                                           int num_sequence, int input_dim,
+                                           int future_context,
+                                           const size_t *batch_indices,
+                                           T *out) {
+  int blx = blockDim.x;
+  int bly = blockDim.y;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int d = blockIdx.x * blx + thx;  // index along input dim
+
+  extern __shared__ T mem[];
+  T *sw = mem;
+
+  if (thy < future_context) {
+    sw[thy * blx + thx] =
+        (d < input_dim) ? wt[thy * input_dim + d] : static_cast<T>(0);
+  }
+  __syncthreads();
+
+  for (size_t i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+           w++) {
+        sum += (d < input_dim)
+                   ? sw[w * blx + thx] * in[(start + k + w) * input_dim + d]
+                   : static_cast<T>(0);
+      }
+      if (d < input_dim) {
+        out[(start + k) * input_dim + d] = sum;
+      }
+    }
+  }
+}
+
+// Forward prop (naive version)
+template <typename T>
+__global__ void RowConvForward(const T *in, const T *wt, int num_sequence,
+                               int input_dim, int future_context,
+                               const size_t *batch_indices, T *out) {
+  int d = blockIdx.x * blockDim.x + threadIdx.x;  // index along input_dim
+  int bly = blockDim.y;
+  int thy = threadIdx.y;
+
+  if (d >= input_dim) return;
+
+  for (size_t i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+           w++) {
+        sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]);
+      }
+      out[(start + k) * input_dim + d] = sum;
+    }
+  }
+}
+
+// Compute input gradient (shared memory version, for small future_context)
+template <typename T>
+__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt,
+                                             int num_sequence, int input_dim,
+                                             int future_context,
+                                             const size_t *batch_indices,
+                                             T *din) {
+  int blx = blockDim.x;
+  int bly = blockDim.y;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int d = blockIdx.x * blx + thx;  // index along input dim
+
+  extern __shared__ T mem[];
+  T *sw = mem;
+  if (thy < future_context) {
+    sw[thy * blx + thx] =
+        (d < input_dim) ? wt[thy * input_dim + d] : static_cast<T>(0);
+  }
+  __syncthreads();
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
+        sum += (d < input_dim)
+                   ? (sw[w * blx + thx] * dout[(k + start - w) * input_dim + d])
+                   : static_cast<T>(0);
+      }
+      if (d < input_dim) {
+        din[(k + start) * input_dim + d] = sum;
+      }
+    }
+  }
+}
+
+// Compute input gradient (Naive version)
+template <typename T>
+__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence,
+                                 int input_dim, int future_context,
+                                 const size_t *batch_indices, T *din) {
+  int d = blockIdx.x * blockDim.x + threadIdx.x;  // index along input_dim
+  int bly = blockDim.y;
+  int thy = threadIdx.y;
+
+  if (d >= input_dim) return;
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
+        sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]);
+      }
+      din[(k + start) * input_dim + d] = sum;
+    }
+  }
+}
+
+// Compute W gradient (small future_context version)
+template <typename T>
+__global__ void RowConvGradFilterImproved(const T *in, const T *dout,
+                                          int num_sequence, int input_dim,
+                                          int future_context, int block_x,
+                                          int block_y,
+                                          const size_t *batch_indices,
+                                          T *dfilter) {
+  int blx = blockDim.x;
+  int bly = blockDim.y;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int gx = blockIdx.x * blx;
+  int d = gx + thx;  // index along input dim
+
+  extern __shared__ T mem[];
+
+  int xdim_sh_in = block_y;
+  int xdim_sh_dout = block_y;
+  // int xdim_sh_dfilter = future_context;
+  int ydim_sh_in = block_x;
+  int ydim_sh_dout = block_x + future_context - 1;
+  int ydim_sh_dfilter = block_y;
+
+  T *sh_in = mem;
+  T *sh_dout = &mem[xdim_sh_in * ydim_sh_in];
+  T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout];
+
+  if (thy < future_context) {
+    sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast<T>(0);
+  }
+  __syncthreads();
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    int scaled_cur_steps =
+        ((current_timesteps + block_x - 1) / block_x) * block_x;
+
+    for (int k = thy; k < scaled_cur_steps; k += block_x) {
+      int pos = start + k;
+      sh_in[thx * ydim_sh_in + thy] =
+          (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0);
+      sh_dout[thx * ydim_sh_dout + thy + future_context - 1] =
+          (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0);
+      __syncthreads();
+
+      if (thy < future_context - 1) {
+        int pos_offset = pos - future_context + 1;
+        sh_dout[thx * ydim_sh_dout + thy] =
+            (d < input_dim && pos_offset >= start)
+                ? dout[pos_offset * input_dim + d]
+                : T(0);
+      }
+      __syncthreads();
+
+      for (int w = 0; w < future_context; w++) {
+        T val = sh_in[thy * ydim_sh_in + thx] *
+                sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w];
+        __syncthreads();
+
+        for (int offset = 16; offset > 0;
+             offset = offset / 2) {  // blockDim.x is 32.
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
+
+        if (thx == 0) {
+          sh_dfilter[w * ydim_sh_dfilter + thy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+  for (int w = thy; (w < future_context) && (d < input_dim); w += bly) {
+    dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx];
+  }
+}
+
+// Compute weight(filter) gradient
+template <typename T>
+__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
+                                  int input_dim, int future_context,
+                                  int block_x, int block_y,
+                                  const size_t *batch_indices, T *dfilter) {
+  int blx = blockDim.x;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int gx = blockIdx.x * blx;
+  int d = gx + thx;  // index along input dim
+  extern __shared__ T mem[];
+  T *sh_in = mem;
+  T *sh_dout = &mem[block_x * block_y];
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    int scaled_cur_steps =
+        ((current_timesteps + block_x - 1) / block_x) * block_x;
+
+    for (int k = thy; k < scaled_cur_steps; k += block_x) {
+      int pos = start + k;
+      sh_in[thx * block_y + thy] =
+          (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0;
+      __syncthreads();
+
+      for (int w = 0; w < future_context; w++) {
+        sh_dout[thx * block_y + thy] =
+            (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps)
+                ? dout[(pos - w) * input_dim + d]
+                : 0.0;
+        __syncthreads();
+
+        T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx];
+        __syncthreads();
+
+        for (int offset = 16; offset > 0;
+             offset = offset / 2) {  // blockDim.x is 32.
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
+
+        if (thx == 0 && (gx + thy) < input_dim) {
+          dfilter[w * input_dim + gx + thy] += val;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+
+template <typename T>
+class RowConvKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<LoDTensor>("X");
+    auto *Filter = context.Input<Tensor>("Filter");
+    auto *Out = context.Output<LoDTensor>("Out");
+
+    const T *in = X->data<T>();
+    const T *weight = Filter->data<T>();
+    T *out = Out->mutable_data<T>(context.GetPlace());
+
+    auto batch_indices = X->lod()[0];
+    int input_dim = X->dims()[1];
+    int num_sequence = batch_indices.size() - 1;
+    int future_context = Filter->dims()[0];
+    size_t *idx = batch_indices.data();
+    auto stream = context.cuda_device_context().stream();
+
+    if (future_context <= 32) {
+      dim3 block_dim = dim3(32, 32);
+      dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+      int mem_per_block = (future_context * block_dim.x) * sizeof(T);
+      RowConvForwardSharedMemory<
+          T><<<grid_dim, block_dim, mem_per_block, stream>>>(
+          in, weight, num_sequence, input_dim, future_context, idx, out);
+    } else {
+      dim3 block_dim = dim3(32, 32);
+      dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+      RowConvForward<T><<<grid_dim, block_dim, 0, stream>>>(
+          in, weight, num_sequence, input_dim, future_context, idx, out);
+    }
+  }
+};
+
+template <typename T>
+class RowConvGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<LoDTensor>("X");
+    auto *Filter = context.Input<Tensor>("Filter");
+    auto *dOut = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    const T *in = X->data<T>();
+    const T *weights = Filter->data<T>();
+    const T *dout = dOut->data<T>();
+
+    Tensor *dX = context.Output<LoDTensor>(framework::GradVarName("X"));
+    Tensor *dFilter = context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    auto batch_indices = X->lod()[0];
+    int input_dim = X->dims()[1];
+    int num_sequence = batch_indices.size() - 1;
+    int future_context = Filter->dims()[0];
+    size_t *idx = batch_indices.data();
+
+    auto &device_ctx = context.cuda_device_context();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
+
+    if (dFilter) {
+      T *dfilter = dFilter->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, dFilter, static_cast<T>(0.0));
+
+      if (future_context <= 32) {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        int block_x = block_dim.x;
+        int block_y = block_dim.y;
+        int mem_per_block =
+            (block_y * block_x + block_y * (block_x + future_context - 1) +
+             future_context * block_y) *
+            sizeof(T);
+        RowConvGradFilterImproved<
+            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
+            idx, dfilter);
+      } else {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        int block_x = block_dim.x;
+        int block_y = block_dim.y;
+        int mem_per_block =
+            (block_x * block_y * 2) * sizeof(T);  // For 2 arrays of size 32x32
+        RowConvGradFilter<
+            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
+            idx, dfilter);
+      }
+    }
+
+    if (dX) {
+      T *din = dX->mutable_data<T>(context.GetPlace());
+      if (future_context <= 32) {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        int mem_per_block = (future_context * block_dim.x) * sizeof(T);
+        RowConvGradInputSharedMemory<
+            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+            dout, weights, num_sequence, input_dim, future_context, idx, din);
+      } else {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        RowConvGradInput<T><<<grid_dim, block_dim, 0, device_ctx.stream()>>>(
+            dout, weights, num_sequence, input_dim, future_context, idx, din);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    row_conv, ops::RowConvKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    row_conv_grad,
+    ops::RowConvGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h
new file mode 100644
index 0000000000..80912ad8f7
--- /dev/null
+++ b/paddle/operators/row_conv_op.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class RowConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+
+template <typename DeviceContext, typename T>
+class RowConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index e5c10fec4d..d848be823e 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -75,8 +75,8 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
                   ops::ScaleGradMaker);
-REGISTER_OP_CPU_KERNEL(scale,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, float>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, double>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, int>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
index 0d70775159..0c7980430f 100644
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
@@ -14,8 +14,10 @@
 
 #include "paddle/operators/scale_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    scale,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
+                                   int64_t>);
diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h
index 4931294c9d..02a8c97a83 100644
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
@@ -19,7 +19,7 @@
 
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class ScaleKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
@@ -31,7 +31,8 @@ class ScaleKernel : public framework::OpKernel<T> {
 
     auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& dev = context.GetEigenDevice<Place>();
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
     eigen_out.device(dev) = scale * eigen_in;
   }
 };
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
index 3b32ae2fb7..6b43a1389f 100644
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
@@ -59,5 +59,5 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
-REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
index ac03eb3752..3e2e2051af 100644
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
@@ -85,7 +85,10 @@ void StartServerNet() {
 
   paddle::framework::AttributeMap attrs;
   attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
-  attrs.insert({"OptimizeBlock", block});
+  std::string program_proto;
+  PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto));
+
+  attrs.insert({"OptimizeProgram", program_proto});
   recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
                                                     {{"Out", {"Out"}}}, attrs);
   paddle::platform::CPUDeviceContext ctx(place);
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index d1de0b4447..9c7e5456e8 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -129,7 +129,7 @@ REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker,
             sequence_concat_grad, ops::SequenceConcatGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_concat,
-    ops::SequenceConcatOpKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     sequence_concat_grad,
-    ops::SequenceConcatGradOpKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceConcatGradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/operators/sequence_concat_op.cu.cc
index 9ca99c2258..144bdb5af6 100644
--- a/paddle/operators/sequence_concat_op.cu.cc
+++ b/paddle/operators/sequence_concat_op.cu.cc
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/operators/sequence_concat_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     sequence_concat,
-    ops::SequenceConcatOpKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    sequence_concat_grad,
-    ops::SequenceConcatGradOpKernel<paddle::platform::GPUPlace, float>);
+    ops::SequenceConcatOpKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(sequence_concat_grad,
+                        ops::SequenceConcatGradOpKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h
index 09212070aa..8445224f46 100644
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
@@ -59,7 +59,7 @@ LoD ConcatLoD(const std::vector<const T*> ins, const size_t level) {
   return out_lod;
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceConcatOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -119,7 +119,7 @@ class SequenceConcatOpKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc
index c5533732d4..f5c4f1c133 100644
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -179,9 +179,10 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
             sequence_conv_grad, ops::SequenceConvGradOp);
 
 REGISTER_OP_CPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::CPUPlace, float>,
-    ops::SequenceConvKernel<paddle::platform::CPUPlace, double>);
+    sequence_conv,
+    ops::SequenceConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceConvKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, float>,
-    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, double>);
+    ops::SequenceConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc
index c8136dbcb3..eacba79ace 100644
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/operators/sequence_conv_op.cu.cc
@@ -15,10 +15,11 @@
 #include "paddle/operators/sequence_conv_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::GPUPlace, float>,
-    ops::SequenceConvKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    sequence_conv,
+    ops::SequenceConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
     sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, float>,
-    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, double>);
+    ops::SequenceConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h
index b8fbe2647c..bb584b7bfa 100644
--- a/paddle/operators/sequence_conv_op.h
+++ b/paddle/operators/sequence_conv_op.h
@@ -23,7 +23,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceConvKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -56,21 +56,23 @@ class SequenceConvKernel : public framework::OpKernel<T> {
     Tensor col;
     col.mutable_data<T>(col_shape, context.GetPlace());
     // Because if padding_trainable is false, padding data should be zeros.
-    math::SetConstant<Place, T> set_zero;
-    set_zero(context.device_context(), &col, static_cast<T>(0));
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    set_zero(dev_ctx, &col, static_cast<T>(0));
 
-    math::ContextProjectFunctor<Place, T> seq_project_functor;
+    math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
 
-    seq_project_functor(context.device_context(), *in, *padding_data,
-                        padding_trainable, context_start, context_length,
-                        context_stride, up_pad, down_pad, &col);
+    seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
+                        context_start, context_length, context_stride, up_pad,
+                        down_pad, &col);
 
-    math::matmul<Place, T>(context.device_context(), col, false, filter, false,
-                           static_cast<T>(1.0), out, static_cast<T>(0.0));
+    math::matmul<DeviceContext, T>(dev_ctx, col, false, filter, false,
+                                   static_cast<T>(1.0), out,
+                                   static_cast<T>(0.0));
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceConvGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -95,7 +97,8 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
     int down_pad = std::max(0, context_start + context_length - 1);
     int sequence_width = static_cast<int>(in->dims()[1]);
 
-    math::SetConstant<Place, T> set_zero;
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     // use col_shape in the im2col calculation
     framework::DDim col_shape = {in->dims()[0],
                                  sequence_width * context_length};
@@ -104,38 +107,36 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
     if (in_g || filter_g || (padding_trainable && padding_data_g)) {
       col.mutable_data<T>(col_shape, context.GetPlace());
       // Because if padding_trainable is false, padding data should be zeros.
-      set_zero(context.device_context(), &col, static_cast<T>(0));
-      math::matmul<Place, T>(context.device_context(), *out_g, false, *filter,
-                             true, T(1.0), &col, T(1.0));
+      set_zero(dev_ctx, &col, static_cast<T>(0));
+      math::matmul<DeviceContext, T>(dev_ctx, *out_g, false, *filter, true,
+                                     T(1.0), &col, T(1.0));
     }
-    math::ContextProjectFunctor<Place, T> seq_project_functor;
-    math::ContextProjectGradFunctor<Place, T> seq_project_grad_functor;
+    math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
+    math::ContextProjectGradFunctor<DeviceContext, T> seq_project_grad_functor;
 
     if (in_g) {
       in_g->mutable_data<T>(context.GetPlace());
       in_g->set_lod(in->lod());
-      set_zero(context.device_context(), in_g, static_cast<T>(0));
+      set_zero(dev_ctx, in_g, static_cast<T>(0));
 
-      seq_project_grad_functor(context.device_context(), *in_g,
-                               padding_trainable, context_start, context_length,
-                               context_stride, up_pad, down_pad, false, true,
-                               padding_data_g, &col);
+      seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start,
+                               context_length, context_stride, up_pad, down_pad,
+                               false, true, padding_data_g, &col);
     }
 
     if (padding_trainable && padding_data_g) {
       padding_data_g->mutable_data<T>(context.GetPlace());
-      set_zero(context.device_context(), padding_data_g, static_cast<T>(0));
+      set_zero(dev_ctx, padding_data_g, static_cast<T>(0));
 
       LoDTensor* input = const_cast<LoDTensor*>(in);
-      seq_project_grad_functor(context.device_context(), *input,
-                               padding_trainable, context_start, context_length,
-                               context_stride, up_pad, down_pad, true, false,
-                               padding_data_g, &col);
+      seq_project_grad_functor(
+          dev_ctx, *input, padding_trainable, context_start, context_length,
+          context_stride, up_pad, down_pad, true, false, padding_data_g, &col);
     }
 
     if (filter_g) {
       filter_g->mutable_data<T>(context.GetPlace());
-      set_zero(context.device_context(), filter_g, static_cast<T>(0));
+      set_zero(dev_ctx, filter_g, static_cast<T>(0));
 
       Tensor filter_grad = *filter_g;
       LoDTensor out_grad = *out_g;
@@ -145,12 +146,12 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
         padding_data = context.Input<Tensor>("PaddingData");
       }
 
-      seq_project_functor(context.device_context(), *in, *padding_data,
-                          padding_trainable, context_start, context_length,
-                          context_stride, up_pad, down_pad, &col);
+      seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
+                          context_start, context_length, context_stride, up_pad,
+                          down_pad, &col);
 
-      math::matmul<Place, T>(context.device_context(), col, true, out_grad,
-                             false, T(1.0), &filter_grad, T(1.0));
+      math::matmul<DeviceContext, T>(dev_ctx, col, true, out_grad, false,
+                                     T(1.0), &filter_grad, T(1.0));
     }
   }
 };
diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/sequence_expand_op.cc
similarity index 82%
rename from paddle/operators/seq_expand_op.cc
rename to paddle/operators/sequence_expand_op.cc
index b862056ad4..770161b593 100644
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/sequence_expand_op.cc
@@ -12,14 +12,14 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/seq_expand_op.h"
+#include "paddle/operators/sequence_expand_op.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
 
-class SeqExpandOp : public framework::OperatorWithKernel {
+class SequenceExpandOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -35,25 +35,25 @@ class SeqExpandOp : public framework::OperatorWithKernel {
   }
 };
 
-class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SeqExpandOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  SequenceExpandOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor or LoDTensor) The input(X) of this operator can be a "
              "LoDTensor or a base Tensor.");
     AddInput("Y",
-             "(LoDTensor)The reference input(Y) of seq_expand op."
+             "(LoDTensor)The reference input(Y) of sequence_expand op."
              "It must be a LoDTensor with k-level(k>0)."
              "The input(X) will be expanded according to LOD of input(Y)."
              "The element numbers of last level in input(Y) "
              "must be equal to dims[0] of input(X).");
     AddOutput("Out",
-              "(LodTensor)The output of seq_expand op."
+              "(LodTensor)The output of sequence_expand op."
               "The lod of output will be as same as input(Y)'s lod.");
     AddComment(R"DOC(
-Seq Expand Operator.
+Sequence Expand Operator.
 
 This operator expands input(X) according to LOD of input(Y).
 Following are cases to better explain how this works:
@@ -124,7 +124,7 @@ then we get 2-level LoDTensor
   }
 };
 
-class SeqExpandOpGrad : public framework::OperatorWithKernel {
+class SequenceExpandOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -146,10 +146,11 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker,
-            seq_expand_grad, ops::SeqExpandOpGrad);
-REGISTER_OP_CPU_KERNEL(seq_expand,
-                       ops::SeqExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
+            sequence_expand_grad, ops::SequenceExpandOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    seq_expand_grad,
-    ops::SeqExpandGradKernel<paddle::platform::CPUPlace, float>);
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/seq_expand_op.cu b/paddle/operators/sequence_expand_op.cu
similarity index 69%
rename from paddle/operators/seq_expand_op.cu
rename to paddle/operators/sequence_expand_op.cu
index f1e4b82a76..f79c84dff8 100644
--- a/paddle/operators/seq_expand_op.cu
+++ b/paddle/operators/sequence_expand_op.cu
@@ -13,11 +13,12 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/seq_expand_op.h"
+#include "paddle/operators/sequence_expand_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(seq_expand,
-                       ops::SeqExpandKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    seq_expand_grad,
-    ops::SeqExpandGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/sequence_expand_op.h
similarity index 87%
rename from paddle/operators/seq_expand_op.h
rename to paddle/operators/sequence_expand_op.h
index 4ef0d02cf8..411b819c65 100644
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/sequence_expand_op.h
@@ -23,8 +23,8 @@ namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 
-template <typename Place, typename T>
-class SeqExpandKernel : public framework::OpKernel<T> {
+template <typename DeviceContext, typename T>
+class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<LoDTensor>("X");
@@ -37,7 +37,8 @@ class SeqExpandKernel : public framework::OpKernel<T> {
                       "The size of last lod level in Input(Y)"
                       "must be equal to dims[0] of Input(X).");
     out->set_lod(y->lod());
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
     size_t element_len = framework::product(x_dims) / x_dims[0];
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto out_starts = out->lod().back();
@@ -50,7 +51,7 @@ class SeqExpandKernel : public framework::OpKernel<T> {
       Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
           out_t(out_data, scale, element_len);
       Eigen::array<int, 2> cast({{scale, 1}});
-      out_t.device(place) = x_t.broadcast(cast);
+      out_t.device(*place) = x_t.broadcast(cast);
       x_data += element_len;
       out_data += element_len * scale;
     }
@@ -69,8 +70,8 @@ class SeqExpandKernel : public framework::OpKernel<T> {
  *    Grad(X).lod = Input(X).lod
  *
  * */
-template <typename Place, typename T>
-class SeqExpandGradKernel : public framework::OpKernel<T> {
+template <typename DeviceContext, typename T>
+class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
@@ -89,8 +90,9 @@ class SeqExpandGradKernel : public framework::OpKernel<T> {
       d_out_t(d_out_data, static_cast<int>(repeat), element_len);
       Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
       d_x_t(d_x_data, static_cast<int>(element_len));
-      auto place = context.GetEigenDevice<Place>();
-      d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
+      auto place =
+          context.template device_context<DeviceContext>().eigen_device();
+      d_x_t.device(*place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
       d_out_data += (repeat * element_len);
       d_x_data += element_len;
     }
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index a2f4257037..3526e45a1b 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -58,12 +58,12 @@ Sequence Pool Operator.
 
 The SequencePoolOp pools features of all time-steps of each instance.
 It supports six pooling types:
-1. AVERAGE: Out[i] = $$avg(X_i)$$
-2. SUM:     Out[i] = $$\sum_jX_{ij}$$
-3. SQRT:    Out[i] = $$\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$
+1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$
+2. SUM:     $$Out[i] = \sum_jX_{ij}$$
+3. SQRT:    $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$
 4. LAST:    Out[i] = last instance in i-th sequence X[i]
 5. FIRST:   Out[i] = first instance in i-th sequence X[i]
-6. MAX:     Out[i] = $$max(X_i)$$
+6. MAX:     $$Out[i] = max(X_i)$$
 
 The following example explains how this works:
 For a mini-batch of 3 variable-length sentences,
@@ -123,7 +123,8 @@ namespace ops = paddle::operators;
 REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker,
             sequence_pool_grad, ops::SequencePoolGradOp);
 REGISTER_OP_CPU_KERNEL(
-    sequence_pool, ops::SequencePoolKernel<paddle::platform::CPUPlace, float>);
+    sequence_pool,
+    ops::SequencePoolKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     sequence_pool_grad,
-    ops::SequencePoolGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SequencePoolGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu
index 66850772d5..fcd6508435 100644
--- a/paddle/operators/sequence_pool_op.cu
+++ b/paddle/operators/sequence_pool_op.cu
@@ -17,8 +17,9 @@
 #include "paddle/operators/sequence_pool_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    sequence_pool, ops::SequencePoolKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    sequence_pool,
+    ops::SequencePoolKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     sequence_pool_grad,
-    ops::SequencePoolGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SequencePoolGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index 7f136d8cf0..7519aa1d72 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -30,7 +30,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequencePoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -54,17 +54,18 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto lod_level_0 = lod[0];
 
     out->mutable_data<T>(context.GetPlace());
-
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<Place, T> max_pool;
+      math::MaxSeqPoolFunctor<DeviceContext, T> max_pool;
       auto* index = context.Output<Tensor>("MaxIndex");
       index->Resize({dims});
       index->mutable_data<int>(context.GetPlace());
-      max_pool(context.device_context(), *in, out, index);
+      max_pool(dev_ctx, *in, out, index);
       return;
     }
 
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
       Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
                               static_cast<int>(lod_level_0[i + 1]));
@@ -91,7 +92,7 @@ class SequencePoolKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -105,20 +106,23 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
     int64_t w = in->numel() / dims[0];
 
     in_g->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
 
     if (pooltype == "MAX") {
-      math::MaxSeqPoolGradFunctor<Place, T> max_pool_grad;
+      math::MaxSeqPoolGradFunctor<DeviceContext, T> max_pool_grad;
       auto* index = context.Input<Tensor>("MaxIndex");
-      max_pool_grad(context.device_context(), *out_g, *index, in_g);
+      max_pool_grad(dev_ctx, *out_g, *index, in_g);
       return;
     }
 
     if (pooltype == "LAST" || pooltype == "FIRST") {
       // set X@Grad be zero at first when pooltype is LAST/FIRST
-      math::SetConstant<Place, T> functor;
-      functor(context.device_context(), in_g, 0);
+      math::SetConstant<DeviceContext, T> functor;
+      functor(dev_ctx, in_g, 0);
     }
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
       auto in_g_t =
           in_g->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
index 255683a572..481db8f9e5 100644
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -125,7 +125,7 @@ REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
             sequence_slice_grad, ops::SequenceSliceGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_slice,
-    ops::SequenceSliceOpKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     sequence_slice_grad,
-    ops::SequenceSliceGradOpKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceSliceGradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu
index a9f59dadba..43a21d619f 100755
--- a/paddle/operators/sequence_slice_op.cu
+++ b/paddle/operators/sequence_slice_op.cu
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/operators/sequence_slice_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     sequence_slice,
-    ops::SequenceSliceOpKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+    ops::SequenceSliceOpKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     sequence_slice_grad,
-    ops::SequenceSliceGradOpKernel<paddle::platform::GPUPlace, float>);
+    ops::SequenceSliceGradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h
index 428ef556da..14bcaebbb4 100644
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
@@ -39,7 +39,7 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data,
   return out_lod;
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceSliceOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -108,7 +108,7 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -143,8 +143,9 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
     if (x_grad) {
       x_grad->mutable_data<T>(ctx.GetPlace());
       x_grad->set_lod(in->lod());
-      math::SetConstant<Place, T> set_zero;
-      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(ctx.template device_context<DeviceContext>(), x_grad,
+               static_cast<T>(0));
 
       auto out_grad_stride = framework::stride(out_grad->dims());
 
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
index 32c1502566..37d5452e6b 100644
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -99,7 +99,7 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
             ops::SequenceSoftmaxGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/operators/sequence_softmax_op.cu.cc
index 7023795a3b..5f65b4daf9 100644
--- a/paddle/operators/sequence_softmax_op.cu.cc
+++ b/paddle/operators/sequence_softmax_op.cu.cc
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/operators/sequence_softmax_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::GPUPlace, float>)
-REGISTER_OP_GPU_KERNEL(
+    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>)
+REGISTER_OP_CUDA_KERNEL(
     sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h
index 1b68dd0662..e889e88cb3 100644
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
@@ -23,7 +23,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceSoftmaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -52,12 +52,13 @@ class SequenceSoftmaxKernel : public framework::OpKernel<T> {
       framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
       x_i.Resize(dims_i);
       out_i.Resize(dims_i);
-      math::SoftmaxFunctor<Place, T>()(ctx.device_context(), &x_i, &out_i);
+      math::SoftmaxFunctor<DeviceContext, T>()(
+          ctx.template device_context<DeviceContext>(), &x_i, &out_i);
     }
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -83,8 +84,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
       out_i.Resize(dims_i);
       out_grad_i.Resize(dims_i);
       x_grad_i.Resize(dims_i);
-      math::SoftmaxGradFunctor<Place, T>()(ctx.device_context(), &out_i,
-                                           &out_grad_i, &x_grad_i);
+      math::SoftmaxGradFunctor<DeviceContext, T>()(
+          ctx.template device_context<DeviceContext>(), &out_i, &out_grad_i,
+          &x_grad_i);
     }
   }
 };
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 5576d7b8be..121bf60b27 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -62,8 +62,8 @@ $$param\_out = param - learning\_rate * grad$$
 };
 
 template <typename T>
-struct SparseSGDFunctor<platform::CPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SparseSGDFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input,
                   const framework::Tensor& learning_rate,
                   framework::Tensor* output) {
@@ -90,13 +90,14 @@ struct SparseSGDFunctor<platform::CPUPlace, T> {
   }
 };
 
-template struct SparseSGDFunctor<platform::CPUPlace, float>;
-template struct SparseSGDFunctor<platform::CPUPlace, double>;
+template struct SparseSGDFunctor<platform::CPUDeviceContext, float>;
+template struct SparseSGDFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker);
-REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<paddle::platform::CPUPlace, float>,
-                       ops::SGDOpKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    sgd, ops::SGDOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SGDOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index 7b6c5ec306..a3c0db7e50 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -41,8 +41,8 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
 }  // namespace
 
 template <typename T>
-struct SparseSGDFunctor<platform::GPUPlace, T> {
-  void operator()(const platform::DeviceContext& context,
+struct SparseSGDFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input,
                   const framework::Tensor& learning_rate,
                   framework::Tensor* output) {
@@ -62,21 +62,19 @@ struct SparseSGDFunctor<platform::GPUPlace, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid(1, in_rows.size());
-    SparseSGDFunctorKernel<
-        T, 256><<<grid, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(in_data, in_rows.data(),
-                                   learning_rate.data<T>(), out_data,
-                                   in_row_numel);
+    SparseSGDFunctorKernel<T, 256><<<grid, threads, 0, context.stream()>>>(
+        in_data, in_rows.data(), learning_rate.data<T>(), out_data,
+        in_row_numel);
   }
 };
 
-template struct SparseSGDFunctor<platform::GPUPlace, float>;
-template struct SparseSGDFunctor<platform::GPUPlace, double>;
+template struct SparseSGDFunctor<platform::CUDADeviceContext, float>;
+template struct SparseSGDFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<paddle::platform::GPUPlace, float>,
-                       ops::SGDOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    sgd, ops::SGDOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SGDOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index 78b595fc6c..c920025a91 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -20,15 +20,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 struct SparseSGDFunctor {
-  void operator()(const platform::DeviceContext& context,
+  void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input,
                   const framework::Tensor& learning_rate,
                   framework::Tensor* output);
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SGDOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -46,7 +46,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
       auto g = framework::EigenVector<T>::Flatten(*grad);
       auto o = framework::EigenVector<T>::Flatten(*param_out);
       auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
-      auto place = ctx.GetEigenDevice<Place>();
+      auto& place =
+          *ctx.template device_context<DeviceContext>().eigen_device();
 
       Eigen::DSizes<int, 1> grad_dsize(grad->numel());
       o.device(place) = p - lr.broadcast(grad_dsize) * g;
@@ -56,8 +57,9 @@ class SGDOpKernel : public framework::OpKernel<T> {
       // It's better to find a more elegant solution.
       PADDLE_ENFORCE_EQ(param, param_out);
       auto* grad = ctx.Input<framework::SelectedRows>("Grad");
-      SparseSGDFunctor<Place, T> functor;
-      functor(ctx.device_context(), *grad, *learning_rate, param_out);
+      SparseSGDFunctor<DeviceContext, T> functor;
+      functor(ctx.template device_context<DeviceContext>(), *grad,
+              *learning_rate, param_out);
     } else {
       PADDLE_THROW("Unsupported Variable Type of Grad");
     }
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
index 782f4c7936..b8a1bf122a 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -142,7 +142,7 @@ REGISTER_OP(sigmoid_cross_entropy_with_logits,
             ops::SigmoidCrossEntropyWithLogitsGradOp);
 REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
                        ops::SigmoidCrossEntropyWithLogitsKernel<
-                           paddle::platform::CPUPlace, float>);
+                           paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
                        ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                           paddle::platform::CPUPlace, float>);
+                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
index 32a39956a1..1b569c93ed 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -16,9 +16,9 @@
 #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits,
-                       ops::SigmoidCrossEntropyWithLogitsKernel<
-                           paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                           paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits,
+                        ops::SigmoidCrossEntropyWithLogitsKernel<
+                            paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                        ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
index 2a9d9bbc77..8fe7c5ba82 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -32,7 +32,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
     auto x = framework::EigenVector<T>::Flatten(*X);
     auto labels = framework::EigenVector<T>::Flatten(*Labels);
     auto out = framework::EigenVector<T>::Flatten(*Out);
-    auto place = context.GetEigenDevice<Place>();
+    auto &place = *context.device_context<DeviceContext>().eigen_device();
 
     // term1 = max(x, 0)
     auto term1 = x.cwiseMax(static_cast<T>(0));
@@ -46,7 +46,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
 };
 
 // dX = sigmoid(X) - labels
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -62,7 +62,8 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
     auto labels = framework::EigenVector<T>::Flatten(*Labels);
     auto dout = framework::EigenVector<T>::Flatten(*dOut);
     auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+    auto &place =
+        *context.template device_context<DeviceContext>().eigen_device();
 
     auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
     dx.device(place) = dout * (sigmoid_x - labels);
diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc
index 08bf2e4e7c..d5a7ccb77e 100644
--- a/paddle/operators/sign_op.cc
+++ b/paddle/operators/sign_op.cc
@@ -67,5 +67,5 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker<float>,
                   ops::SignGradMaker);
-REGISTER_OP_CPU_KERNEL(sign,
-                       ops::SignKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sign, ops::SignKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu
index 4d0638cb97..9bc1c65d21 100644
--- a/paddle/operators/sign_op.cu
+++ b/paddle/operators/sign_op.cu
@@ -14,5 +14,6 @@
 
 #include "paddle/operators/sign_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    sign, paddle::operators::SignKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sign,
+    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h
index ab5cd4bac0..2e476ed665 100644
--- a/paddle/operators/sign_op.h
+++ b/paddle/operators/sign_op.h
@@ -19,7 +19,7 @@
 
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SignKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
@@ -29,7 +29,8 @@ class SignKernel : public framework::OpKernel<T> {
 
     auto eigen_out = framework::EigenVector<T>::Flatten(*out);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     eigen_out.device(place) = eigen_in.sign();
   }
 };
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index 50543fcc14..56e8d9058f 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -138,7 +138,8 @@ REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp,
             ops::SmoothL1LossOpMaker<float>, smooth_l1_loss_grad,
             ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
-    smooth_l1_loss, ops::SmoothL1LossKernel<paddle::platform::CPUPlace, float>);
+    smooth_l1_loss,
+    ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     smooth_l1_loss_grad,
-    ops::SmoothL1LossGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SmoothL1LossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu
index 1c3172f438..8e94ebac64 100644
--- a/paddle/operators/smooth_l1_loss_op.cu
+++ b/paddle/operators/smooth_l1_loss_op.cu
@@ -17,8 +17,9 @@
 #include "paddle/operators/smooth_l1_loss_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    smooth_l1_loss, ops::SmoothL1LossKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    smooth_l1_loss,
+    ops::SmoothL1LossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     smooth_l1_loss_grad,
-    ops::SmoothL1LossGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SmoothL1LossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h
index 39d0070b6c..1a70c9c63c 100644
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
@@ -44,7 +44,7 @@ struct SmoothL1LossForward {
   T sigma2;
 };
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class SmoothL1LossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -57,7 +57,8 @@ class SmoothL1LossKernel : public framework::OpKernel<T> {
 
     out0->mutable_data<T>(context.GetPlace());
     out1->mutable_data<T>(context.GetPlace());
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
 
     auto sigma = static_cast<T>(context.Attr<AttrType>("sigma"));
     T sigma2 = sigma * sigma;
@@ -67,12 +68,12 @@ class SmoothL1LossKernel : public framework::OpKernel<T> {
     auto y = EigenVector<T>::Flatten(*in1);
     auto diff = EigenVector<T>::Flatten(*out0);
 
-    diff.device(place) = x - y;
+    diff.device(*place) = x - y;
     // multiply inside weight
     if (has_weight) {
       auto inside_weight = EigenVector<T>::Flatten(*in2);
       // cache diff, reused in bp
-      diff.device(place) = diff * inside_weight;
+      diff.device(*place) = diff * inside_weight;
     }
 
     auto in_counts = in0->numel();
@@ -81,12 +82,12 @@ class SmoothL1LossKernel : public framework::OpKernel<T> {
                                    context.GetPlace());
     auto errors = EigenVector<T>::Flatten(ptensor_errors);
     // apply smooth l1 forward
-    errors.device(place) = diff.unaryExpr(SmoothL1LossForward<T>(sigma2));
+    errors.device(*place) = diff.unaryExpr(SmoothL1LossForward<T>(sigma2));
 
     // multiply outside weight
     if (has_weight) {
       auto outside_weight = EigenVector<T>::Flatten(*in3);
-      errors.device(place) = errors * outside_weight;
+      errors.device(*place) = errors * outside_weight;
     }
     auto loss = EigenVector<T>::Flatten(*out1);
     // first dimension of 'X' is the number of samples
@@ -94,7 +95,7 @@ class SmoothL1LossKernel : public framework::OpKernel<T> {
         framework::make_ddim({static_cast<int>(in0->dims()[0]),
                               static_cast<int>(in_counts / in0->dims()[0])});
     auto errors_mat_view = EigenMatrix<T>::From(ptensor_errors, mat_dims);
-    loss.device(place) = errors_mat_view.sum(Eigen::array<int, 1>({{1}}));
+    loss.device(*place) = errors_mat_view.sum(Eigen::array<int, 1>({{1}}));
   }
 };
 
@@ -114,7 +115,7 @@ struct SmoothL1LossBackward {
   T sigma2;
 };
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename DeviceContext, typename T, typename AttrType = T>
 class SmoothL1LossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -126,7 +127,8 @@ class SmoothL1LossGradKernel : public framework::OpKernel<T> {
     T sigma2 = sigma * sigma;
     bool has_weight = (in0 != nullptr) && (in1 != nullptr);
 
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
 
     auto in_dims = in2->dims();
     auto counts = in2->numel();
@@ -139,7 +141,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel<T> {
                                  context.GetPlace());
     auto diff = EigenVector<T>::Flatten(ptensor_diff);
     // apply smooth l1 backwoard
-    diff.device(place) = EigenVector<T>::Flatten(*in2).unaryExpr(
+    diff.device(*place) = EigenVector<T>::Flatten(*in2).unaryExpr(
         SmoothL1LossBackward<T>(sigma2));
 
     // compute weights
@@ -147,11 +149,11 @@ class SmoothL1LossGradKernel : public framework::OpKernel<T> {
     ptensor_weights.mutable_data<T>(mat_dims, context.GetPlace());
     auto weights = EigenMatrix<T>::From(ptensor_weights);
     // initialize to 1.0
-    weights.device(place) = weights.constant(static_cast<T>(1.0));
+    weights.device(*place) = weights.constant(static_cast<T>(1.0));
     if (has_weight) {
       auto inside_weight = EigenMatrix<T>::From(*in0, mat_dims);
       auto outside_weight = EigenMatrix<T>::From(*in1, mat_dims);
-      weights.device(place) = inside_weight * outside_weight;
+      weights.device(*place) = inside_weight * outside_weight;
     }
 
     // compute gradients
@@ -167,13 +169,13 @@ class SmoothL1LossGradKernel : public framework::OpKernel<T> {
     if (out0) {
       out0->mutable_data<T>(context.GetPlace());
       auto x_grad = EigenMatrix<T>::From(*out0, mat_dims);
-      x_grad.device(place) = gradients;
+      x_grad.device(*place) = gradients;
     }
 
     if (out1) {
       out1->mutable_data<T>(context.GetPlace());
       auto y_grad = EigenMatrix<T>::From(*out1, mat_dims);
-      y_grad.device(place) = -1 * gradients;
+      y_grad.device(*place) = -1 * gradients;
     }
   }
 };
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 93e0525bad..0988c83d43 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -89,7 +89,8 @@ namespace ops = paddle::operators;
 
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
             ops::SoftmaxOpGrad);
-REGISTER_OP_CPU_KERNEL(softmax,
-                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::CPUPlace, float>);
+    softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    softmax_grad,
+    ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc
index 013ace19ae..7b9882cbcf 100644
--- a/paddle/operators/softmax_op.cu.cc
+++ b/paddle/operators/softmax_op.cu.cc
@@ -16,7 +16,8 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(softmax,
-                       ops::SoftmaxKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    softmax, ops::SoftmaxKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    softmax_grad,
+    ops::SoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 44d1e63f1b..0f8998b99e 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -21,7 +21,7 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SoftmaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -31,11 +31,12 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     Y->mutable_data<T>(context.GetPlace());
 
-    math::SoftmaxFunctor<Place, T>()(context.device_context(), X, Y);
+    math::SoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), X, Y);
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SoftmaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -46,7 +47,8 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     dX->mutable_data<T>(context.GetPlace());
 
-    math::SoftmaxGradFunctor<Place, T>()(context.device_context(), Y, dY, dX);
+    math::SoftmaxGradFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), Y, dY, dX);
   }
 };
 
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index b1faddac3f..6100c63f9a 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -69,10 +69,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
 
-    math::SoftmaxFunctor<platform::GPUPlace, T>()(context.device_context(),
-                                                  logits, softmax);
-    math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        context.device_context(), loss, softmax, labels,
+    math::SoftmaxFunctor<platform::CUDADeviceContext, T>()(
+        context.cuda_device_context(), logits, softmax);
+    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+        context.cuda_device_context(), loss, softmax, labels,
         context.Attr<bool>("soft_label"));
   }
 };
@@ -98,18 +98,18 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 
     if (context.Attr<bool>("soft_label")) {
       const T* label_data = labels->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              context.device_context())
-                              .stream()>>>(logit_grad_data, loss_grad_data,
-                                           label_data, batch_size, class_num);
+      SoftCrossEntropyGradientKernel<
+          T><<<grid, block, 0,
+               context.template device_context<platform::CUDADeviceContext>()
+                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
+                                batch_size, class_num);
     } else {
       const int64_t* label_data = labels->data<int64_t>();
-      CrossEntropyGrad<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              context.device_context())
-                              .stream()>>>(logit_grad_data, loss_grad_data,
-                                           label_data, batch_size, class_num);
+      CrossEntropyGrad<
+          T><<<grid, block, 0,
+               context.template device_context<platform::CUDADeviceContext>()
+                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
+                                batch_size, class_num);
     }
   }
 };
@@ -118,9 +118,9 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
-                       ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
-REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
-                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy,
+                        ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+                        ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad,
+                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index c4ab3f74b4..9c3431605b 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -40,11 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
 
-    math::SoftmaxFunctor<platform::CPUPlace, T>()(context.device_context(),
-                                                  logits, softmax);
-    math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        context.device_context(), loss, softmax, labels,
-        context.Attr<bool>("soft_label"));
+    auto& dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();
+    math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
+                                                          softmax);
+    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
+        dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"));
   }
 };
 
@@ -62,14 +63,15 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     const int class_num = logit_grad->dims()[1];
     auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
     auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
-
+    auto& place = *context.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
     if (context.Attr<bool>("soft_label")) {
       auto lbl_mat = EigenMatrix<T>::From(*labels);
-      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
+      logit_grad_mat.device(place) =
           out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
           (logit_grad_mat - lbl_mat);
     } else {
-      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
+      logit_grad_mat.device(place) =
           logit_grad_mat *
           out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
 
diff --git a/paddle/operators/split_op.cu.cc b/paddle/operators/split_op.cu.cc
index 93d1fc3c44..dbad0bbf68 100644
--- a/paddle/operators/split_op.cu.cc
+++ b/paddle/operators/split_op.cu.cc
@@ -14,5 +14,5 @@ limitations under the License. */
 
 #include "paddle/operators/split_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(split,
-                       ops::SplitOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h
index fa26e5f677..a38c435d53 100644
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SplitOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/spp_op.cc b/paddle/operators/spp_op.cc
new file mode 100644
index 0000000000..b1807b6261
--- /dev/null
+++ b/paddle/operators/spp_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+namespace paddle {
+namespace operators {
+
+class SppOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SppOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of spp operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of spp operator."
+              "N * M."
+              "M = C * H * W");
+    AddAttr<int>("pyramid_height", "(int), multi level pooling");
+    AddAttr<std::string>(
+        "pooling_type",
+        "(string), pooling type, can be \"max\" for max-pooling "
+        "and \"avg\" for average-pooling.")
+        .InEnum({"max", "avg"});
+    AddComment(R"DOC(
+        "With spatial pyramid pooling, the input image can
+        be of any sizes. This not only allows arbitrary aspect
+        ratios, but also allows arbitrary scales. We can resize
+        the input image to any scale (e.g., min(w, h)=180, 224,
+        ...) and apply the same deep network. When the
+        input image is at different scales, the network (with
+        the same filter sizes) will extract features at different
+        scales. The scales play important roles in traditional
+        methods.
+        Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = N \\
+            W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in}
+          $$
+        paper https://arxiv.org/pdf/1406.4729v4.pdf
+        )DOC");
+  }
+};
+
+class SppOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SppOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SppOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Spping intput must be of 4-dimensional.");
+    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
+    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class SppOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/spp_op.cu.cc b/paddle/operators/spp_op.cu.cc
new file mode 100644
index 0000000000..761e4d6c4a
--- /dev/null
+++ b/paddle/operators/spp_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/spp_op.h b/paddle/operators/spp_op.h
new file mode 100644
index 0000000000..f35b305d02
--- /dev/null
+++ b/paddle/operators/spp_op.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SppKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    out->mutable_data<T>(context.GetPlace());
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t output_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // pooling output shape
+      framework::Tensor out_level;
+      std::vector<int64_t> output_shape_vec(
+          {in_x->dims()[0], in_x->dims()[1], bins, bins});
+      framework::DDim output_shape(framework::make_ddim(output_shape_vec));
+      out_level.mutable_data<T>(output_shape, context.GetPlace());
+      // pooling
+      if (pooling_type == "max") {
+        math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
+        math::MaxPool<T> max_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, max_process, &out_level);
+      } else if (pooling_type == "avg") {
+        math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
+        math::AvgPool<T> avg_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, avg_process, &out_level);
+      }
+      // flatten pooling output shape
+      int output_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> output_flatten_shape_vec(
+          {in_x->dims()[0], output_flatten_w});
+      framework::DDim output_flatten_shape(
+          framework::make_ddim(output_flatten_shape_vec));
+      out_level.Resize(output_flatten_shape);
+      // concat
+      auto out_level_stride = framework::stride(out_level.dims());
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_level.data<T>(), out_level_stride, out_level.dims(),
+                       out_stride, out->data<T>() + output_offset);
+      output_offset += out_level.dims()[1] * out_level_stride[1];
+    }
+  }
+};
+template <typename DeviceContext, typename T>
+class SppGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    zero(device_ctx, in_x_grad, static_cast<T>(0));
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t out_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // split out and outgrad  ...  to flatten
+      framework::Tensor out_level;
+      framework::Tensor outgrad_level;
+      int out_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> out_flatten_shape_vec(
+          {in_x->dims()[0], out_flatten_w});
+      framework::DDim out_flatten_shape(
+          framework::make_ddim(out_flatten_shape_vec));
+      out_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      auto flatten_stride = framework::stride(out_level.dims());
+      // memcpy
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out->data<T>() + out_offset, out_stride,
+                       out_level.dims(), flatten_stride, out_level.data<T>());
+
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_grad->data<T>() + out_offset, out_stride,
+                       outgrad_level.dims(), flatten_stride,
+                       outgrad_level.data<T>());
+      out_offset += out_level.dims()[1] * out_stride[1];
+      // flatten backward to nchw
+
+      std::vector<int64_t> out_shape_vec({in_x->dims()[0], in_x->dims()[1]});
+      out_shape_vec.push_back(
+          (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1);
+      out_shape_vec.push_back(
+          (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1);
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      out_level.ShareDataWith(out_level);
+      out_level.Resize(out_shape);
+      outgrad_level.ShareDataWith(outgrad_level);
+      outgrad_level.Resize(out_shape);
+      // pooling backward
+      if (pooling_type == "max") {
+        math::MaxPool2dGradFunctor<DeviceContext, T> pool2d_backward;
+        pool2d_backward(context.template device_context<DeviceContext>(), *in_x,
+                        *&out_level, *&outgrad_level, kernel_size, strides,
+                        paddings, in_x_grad);
+      } else if (pooling_type == "avg") {
+        math::Pool2dGradFunctor<DeviceContext, math::AvgPoolGrad<T>, T>
+            pool_backward;
+        math::AvgPoolGrad<T> avg_process;
+        pool_backward(context.template device_context<DeviceContext>(), *in_x,
+                      *&out_level, *&outgrad_level, kernel_size, strides,
+                      paddings, avg_process, in_x_grad);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
index bec2a2c18a..50bc6da196 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -115,7 +115,7 @@ REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
             ops::SquaredL2DistanceGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_distance,
-    ops::SquaredL2DistanceKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    squared_l2_distance_grad,
-    ops::SquaredL2DistanceGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad,
+                       ops::SquaredL2DistanceGradKernel<
+                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu
index 3fe62f1a9c..ecc82ed1e4 100644
--- a/paddle/operators/squared_l2_distance_op.cu
+++ b/paddle/operators/squared_l2_distance_op.cu
@@ -17,9 +17,9 @@
 #include "paddle/operators/squared_l2_distance_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     squared_l2_distance,
-    ops::SquaredL2DistanceKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    squared_l2_distance_grad,
-    ops::SquaredL2DistanceGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SquaredL2DistanceKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad,
+                        ops::SquaredL2DistanceGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h
index 259ef40296..5bd5f4819a 100644
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SquaredL2DistanceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -51,7 +51,8 @@ class SquaredL2DistanceKernel : public framework::OpKernel<T> {
     auto sub_result = EigenMatrix<T>::From(*out0);
     auto z = EigenVector<T>::Flatten(*out1);
 
-    auto place = context.GetEigenDevice<Place>();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
     auto x_dims = x.dimensions();
     auto y_dims = y.dimensions();
     // buffer the substraction result
@@ -67,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -89,7 +90,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
                     sub_result;
 
     // propagate back to input
-    auto eigen_place = context.GetEigenDevice<Place>();
+    auto& eigen_place =
+        *context.template device_context<DeviceContext>().eigen_device();
     if (x_g) {
       x_g->mutable_data<T>(context.GetPlace());
       // eigen matrix
diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc
index 3c10e6159f..3cff61a02f 100644
--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/operators/squared_l2_norm_op.cc
@@ -72,7 +72,7 @@ REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
             squared_l2_norm_grad, ops::SquaredL2NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_norm,
-    ops::SquaredL2NormKernel<paddle::platform::CPUPlace, float>);
+    ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_norm_grad,
-    ops::SquaredL2NormGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SquaredL2NormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu
index d384e9c28c..2d6567d090 100644
--- a/paddle/operators/squared_l2_norm_op.cu
+++ b/paddle/operators/squared_l2_norm_op.cu
@@ -16,9 +16,9 @@
 #include "paddle/operators/squared_l2_norm_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     squared_l2_norm,
-    ops::SquaredL2NormKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+    ops::SquaredL2NormKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     squared_l2_norm_grad,
-    ops::SquaredL2NormGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SquaredL2NormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h
index 48d7b1c2d5..0ced7e7d70 100644
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 // Out = sum(square(X))
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SquaredL2NormKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -30,14 +30,15 @@ class SquaredL2NormKernel : public framework::OpKernel<T> {
 
     auto x = framework::EigenVector<T>::Flatten(*X);
     auto out = framework::EigenScalar<T>::From(*Out);
-    auto place = context.GetEigenDevice<Place>();
+    auto *place =
+        context.template device_context<DeviceContext>().eigen_device();
 
-    out.device(place) = x.square().sum();
+    out.device(*place) = x.square().sum();
   }
 };
 
 // dX = X
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SquaredL2NormGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -53,10 +54,11 @@ class SquaredL2NormGradKernel : public framework::OpKernel<T> {
     auto x = framework::EigenVector<T>::Flatten(*X);
     auto dout = framework::EigenVector<T>::Flatten(*dOut);
     auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+    auto *place =
+        context.template device_context<DeviceContext>().eigen_device();
 
     Eigen::DSizes<int, 1> x_dsize(X->numel());
-    dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast<T>(2.0);
+    dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast<T>(2.0);
   }
 };
 
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 744b2fe3f2..cd52672f78 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -195,7 +195,8 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
                   ops::SumOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>,
-                       ops::SumKernel<paddle::platform::CPUPlace, double>,
-                       ops::SumKernel<paddle::platform::CPUPlace, int>,
-                       ops::SumKernel<paddle::platform::CPUPlace, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index 5c30dd4d47..873155076c 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -13,7 +13,8 @@ limitations under the License. */
 #include "paddle/operators/sum_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>,
-                       ops::SumKernel<paddle::platform::GPUPlace, double>,
-                       ops::SumKernel<paddle::platform::GPUPlace, int>,
-                       ops::SumKernel<paddle::platform::GPUPlace, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index ed6c80ce60..eaa36aa1ae 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class SumKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -43,12 +43,14 @@ class SumKernel : public framework::OpKernel<T> {
       auto result = EigenVector<T>::Flatten(*out);
 
       if (!in_place) {
-        math::SetConstant<Place, T> constant_functor;
-        constant_functor(context.device_context(), out, 0.0);
+        math::SetConstant<DeviceContext, T> constant_functor;
+        constant_functor(context.template device_context<DeviceContext>(), out,
+                         0.0);
       }
 
-      math::SelectedRowsAddToTensor<Place, T> functor;
-      auto place = context.GetEigenDevice<Place>();
+      math::SelectedRowsAddToTensor<DeviceContext, T> functor;
+      auto &place =
+          *context.template device_context<DeviceContext>().eigen_device();
       // If in_place, just skip the first tensor
       for (int i = in_place ? 1 : 0; i < N; i++) {
         if (in_vars[i]->IsType<framework::LoDTensor>()) {
@@ -60,7 +62,7 @@ class SumKernel : public framework::OpKernel<T> {
           result.device(place) = result + in;
         } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
           auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
-          functor(context.device_context(), in_t, out);
+          functor(context.template device_context<DeviceContext>(), in_t, out);
         } else {
           PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
         }
@@ -82,14 +84,14 @@ class SumKernel : public framework::OpKernel<T> {
       out_value->Resize(framework::make_ddim(in_dim_vec));
       out_value->mutable_data<T>(context.GetPlace());
 
-      math::SelectedRowsAddTo<Place, T> functor;
+      math::SelectedRowsAddTo<DeviceContext, T> functor;
 
       int64_t offset = 0;
       for (int i = 0; i < N; i++) {
         PADDLE_ENFORCE_EQ(out->height(),
                           in_vars[i]->Get<SelectedRows>().height());
-        functor(context.device_context(), in_vars[i]->Get<SelectedRows>(),
-                offset, out);
+        functor(context.template device_context<DeviceContext>(),
+                in_vars[i]->Get<SelectedRows>(), offset, out);
         offset += in_vars[i]->Get<SelectedRows>().value().numel();
       }
     } else if (out_var->IsType<framework::LoDTensorArray>()) {
@@ -112,7 +114,8 @@ class SumKernel : public framework::OpKernel<T> {
               PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
               auto in = EigenVector<T>::Flatten(in_array[i]);
               auto result = EigenVector<T>::Flatten(out_array[i]);
-              result.device(context.GetEigenDevice<Place>()) = result + in;
+              result.device(*context.template device_context<DeviceContext>()
+                                 .eigen_device()) = result + in;
             }
           }
         }
diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu
index 7851c71bbe..453bd07267 100644
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -317,4 +317,4 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>);
diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h
index bc8563717a..e9cd9bbd4d 100644
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index 94de3d5069..de5ff561ad 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -112,8 +112,8 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad,
             ops::TransposeOpGrad);
-REGISTER_OP_CPU_KERNEL(transpose,
-                       ops::TransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc
index af3f581462..7d23f1493e 100644
--- a/paddle/operators/transpose_op.cu.cc
+++ b/paddle/operators/transpose_op.cu.cc
@@ -15,8 +15,9 @@
 #include "paddle/operators/transpose_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(transpose,
-                       ops::TransposeKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
+    transpose,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
     transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h
index e296032f41..d995271a6b 100644
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
@@ -20,33 +20,33 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
-inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx,
+template <typename DeviceContext, typename T>
+inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
                          const framework::Tensor& in, framework::Tensor* out,
                          const std::vector<int>& axis) {
   switch (dim) {
     case 1:
-      math::Transpose<Place, T, 1> trans1;
+      math::Transpose<DeviceContext, T, 1> trans1;
       trans1(dev_ctx, in, out, axis);
       break;
     case 2:
-      math::Transpose<Place, T, 2> trans2;
+      math::Transpose<DeviceContext, T, 2> trans2;
       trans2(dev_ctx, in, out, axis);
       break;
     case 3:
-      math::Transpose<Place, T, 3> trans3;
+      math::Transpose<DeviceContext, T, 3> trans3;
       trans3(dev_ctx, in, out, axis);
       break;
     case 4:
-      math::Transpose<Place, T, 4> trans4;
+      math::Transpose<DeviceContext, T, 4> trans4;
       trans4(dev_ctx, in, out, axis);
       break;
     case 5:
-      math::Transpose<Place, T, 5> trans5;
+      math::Transpose<DeviceContext, T, 5> trans5;
       trans5(dev_ctx, in, out, axis);
       break;
     case 6:
-      math::Transpose<Place, T, 6> trans6;
+      math::Transpose<DeviceContext, T, 6> trans6;
       trans6(dev_ctx, in, out, axis);
       break;
     default:
@@ -54,7 +54,7 @@ inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx,
   }
 }
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class TransposeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -64,12 +64,12 @@ class TransposeKernel : public framework::OpKernel<T> {
 
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     int ndims = axis.size();
-    auto& dev_ctx = context.device_context();
-    TransCompute<Place, T>(ndims, dev_ctx, *x, out, axis);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    TransCompute<DeviceContext, T>(ndims, dev_ctx, *x, out, axis);
   }
 };
 
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class TransposeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -88,8 +88,9 @@ class TransposeGradKernel : public framework::OpKernel<T> {
     }
 
     int ndims = axis.size();
-    auto& dev_ctx = context.device_context();
-    TransCompute<Place, T>(ndims, dev_ctx, *out_grad, x_grad, reversed_axis);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    TransCompute<DeviceContext, T>(ndims, dev_ctx, *out_grad, x_grad,
+                                   reversed_axis);
   }
 };
 
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index fff1dc7ccd..2a49ee471f 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
-        ctx.device_context());
+        ctx.GetPlace());
   }
 };
 
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index 8b20bb8287..cfe9d293cf 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -63,6 +63,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(uniform_random,
-                       paddle::operators::GPUUniformRandomKernel<float>,
-                       paddle::operators::GPUUniformRandomKernel<double>);
+REGISTER_OP_CUDA_KERNEL(uniform_random,
+                        paddle::operators::GPUUniformRandomKernel<float>,
+                        paddle::operators::GPUUniformRandomKernel<double>);
diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc
index 89c48e071c..49df2a530c 100644
--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
@@ -135,9 +135,10 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
             ops::UnpoolOpGrad);
-REGISTER_OP_CPU_KERNEL(unpool,
-                       ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
-                       ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
-    ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
+    unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    unpool_grad,
+    ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc
index 18aafb7dc7..9b002e35c4 100644
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/operators/unpool_op.cu.cc
@@ -15,9 +15,10 @@ limitations under the License. */
 #include "paddle/operators/unpool_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(unpool,
-                       ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
-                       ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
-REGISTER_OP_GPU_KERNEL(
-    unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
-    ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    unpool, ops::UnpoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::UnpoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    unpool_grad,
+    ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h
index 243eb7e532..ee18b118c9 100644
--- a/paddle/operators/unpool_op.h
+++ b/paddle/operators/unpool_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class UnpoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -32,15 +32,16 @@ class UnpoolKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     T* output_data = out->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (output_data) {
-      math::SetConstant<Place, T> set_zero;
-      set_zero(context.device_context(), out, static_cast<T>(0));
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(dev_ctx, out, static_cast<T>(0));
     }
-    math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
-    unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
+    math::Unpool2dMaxFunctor<DeviceContext, T> unpool2d_max_forward;
+    unpool2d_max_forward(dev_ctx, *in_x, *in_y, out);
   }
 };
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class UnpoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -56,15 +57,14 @@ class UnpoolGradKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
 
-    auto& device_ctx = context.device_context();
-    math::SetConstant<Place, T> zero;
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
     if (in_x_grad) {
       in_x_grad->mutable_data<T>(context.GetPlace());
       zero(device_ctx, in_x_grad, static_cast<T>(0));
     }
-    math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
-    unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out,
-                          *out_grad, in_x_grad);
+    math::Unpool2dMaxGradFunctor<DeviceContext, T> unpool2d_max_backward;
+    unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad);
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 9b3f21cf94..9a092a570f 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -25,7 +25,7 @@ namespace operators {
 using StepScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;
 
-constexpr char kStepBlock[] = "step_block";
+constexpr char kStepBlock[] = "sub_block";
 constexpr char kCondition[] = "Condition";
 constexpr char kStepScopes[] = "StepScopes";
 constexpr char kParameters[] = "X";
@@ -187,7 +187,8 @@ class WhileGradOp : public framework::OperatorBase {
             attrs["value"] = 0.0f;
 
             auto zero_op = framework::OpRegistry::CreateOp(
-                "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
+                "fill_constant", framework::VariableNameMap{},
+                {{"Out", {pg_names[param_id]}}}, attrs);
             zero_op->Run(scope, dev_ctx);
           }
         }
@@ -195,7 +196,7 @@ class WhileGradOp : public framework::OperatorBase {
         auto new_inside_name = cur_scope.Rename(inside_grad_name);
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-            {{"Out", {pg_names[param_id]}}}, {});
+            {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
         sum_op->Run(cur_scope, dev_ctx);
         cur_scope.Rename(new_inside_name, inside_grad_name);
       }
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index f157188a4f..5b0c52a30d 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -38,7 +38,7 @@ public:
     real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
                                   ? 1.0 - paraConfig.momentum()
                                   : 1.0;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
     sgdUpdate(learningRate_ * paraConfig.learning_rate() *
                   (firstTime_ ? 1.0 : torch_learningRate),
               paraConfig.momentum(),
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index 1898598e49..d60cb36383 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
-#ifdef PADDLE_USE_MKLML
+#ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
   for (size_t i = 0; i < size; ++i) {
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 7afcdfce93..8cdc5f4340 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -15,12 +15,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-template <>
-Eigen::DefaultDevice* DeviceContext::GetEigenDevice<
-    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
-}
-
 CPUDeviceContext::CPUDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
@@ -37,12 +31,6 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 
 #ifdef PADDLE_WITH_CUDA
 
-template <>
-Eigen::GpuDevice*
-DeviceContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
-}
-
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
  public:
   EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
@@ -122,10 +110,6 @@ Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-}
-
-void CUDADeviceContext::Finish() const {
-  Wait();
   PADDLE_ENFORCE(cudaGetLastError());
 }
 
@@ -141,6 +125,22 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
+CUDNNDeviceContext::CUDNNDeviceContext(CUDNNPlace place)
+    : CUDADeviceContext(place), place_(place) {
+  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream()));
+}
+
+CUDNNDeviceContext::~CUDNNDeviceContext() {
+  SetDeviceId(place_.device);
+  Wait();
+  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+}
+
+Place CUDNNDeviceContext::GetPlace() const { return CUDNNPlace(); }
+
+cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; }
+
 #endif
 
 }  // namespace platform
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 526d089e35..56813a1d5b 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -27,27 +27,12 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-template <typename T>
-struct EigenDeviceConverter;
-
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
-
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
   virtual Place GetPlace() const = 0;
 
-  template <typename PlaceType,
-            typename DeviceType =
-                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
-
   virtual void Wait() const {}
-
-  virtual void Finish() const {}
 };
 
 class CPUDeviceContext : public DeviceContext {
@@ -64,10 +49,6 @@ class CPUDeviceContext : public DeviceContext {
 };
 
 #ifdef PADDLE_WITH_CUDA
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
 
 class EigenCudaStreamDevice;
 
@@ -79,9 +60,6 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Wait for all operations completion in the stream. */
   void Wait() const override;
 
-  /*! \brief  Check potential errors for the cuda kernel calls. */
-  void Finish() const override;
-
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
 
@@ -108,6 +86,22 @@ class CUDADeviceContext : public DeviceContext {
   cublasHandle_t cublas_handle_;
 };
 
+class CUDNNDeviceContext : public CUDADeviceContext {
+ public:
+  explicit CUDNNDeviceContext(CUDNNPlace place);
+  virtual ~CUDNNDeviceContext();
+
+  /*! \brief  Return place in the device context. */
+  Place GetPlace() const final;
+
+  /*! \brief  Return cudnn  handle in the device context. */
+  cudnnHandle_t cudnn_handle() const;
+
+ private:
+  cudnnHandle_t cudnn_handle_;
+  CUDNNPlace place_;
+};
+
 #endif
 
 }  // namespace platform
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index 8bf5174c4a..109c13a881 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -22,9 +22,8 @@ TEST(Device, Init) {
 
   int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
-    DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
-    Eigen::GpuDevice* gpu_device =
-        device_context->template GetEigenDevice<GPUPlace>();
+    CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
+    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device);
     delete device_context;
   }
@@ -47,3 +46,19 @@ TEST(Device, CUDADeviceContext) {
     delete device_context;
   }
 }
+
+TEST(Device, CUDNNDeviceContext) {
+  using paddle::platform::CUDNNDeviceContext;
+  using paddle::platform::CUDNNPlace;
+  if (paddle::platform::dynload::HasCUDNN()) {
+    int count = paddle::platform::GetCUDADeviceCount();
+    for (int i = 0; i < count; ++i) {
+      CUDNNDeviceContext* device_context =
+          new CUDNNDeviceContext(CUDNNPlace(i));
+      cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+      ASSERT_NE(nullptr, cudnn_handle);
+      ASSERT_NE(nullptr, device_context->stream());
+      delete device_context;
+    }
+  }
+}
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
index 761d9edd87..76ec82e108 100644
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/platform/dynload/cudnn.h>
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
@@ -41,6 +42,21 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif
 
+#ifdef PADDLE_USE_DSO
+bool HasCUDNN() {
+  std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle);
+  return cudnn_dso_handle != nullptr;
+}
+
+void EnforceCUDNNLoaded(const char* fn_name) {
+  PADDLE_ENFORCE(cudnn_dso_handle != nullptr,
+                 "Cannot load cudnn shared library. Cannot invoke method %s",
+                 fn_name);
+}
+#else
+bool HasCUDNN() { return true; }
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
index 61caac5450..8c937b37d7 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -25,9 +25,11 @@ namespace dynload {
 
 extern std::once_flag cudnn_dso_flag;
 extern void* cudnn_dso_handle;
+extern bool HasCUDNN();
 
 #ifdef PADDLE_USE_DSO
 
+extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
   struct DynLoad__##__name {                                       \
     template <typename... Args>                                    \
@@ -36,6 +38,7 @@ extern void* cudnn_dso_handle;
       std::call_once(cudnn_dso_flag,                               \
                      paddle::platform::dynload::GetCudnnDsoHandle, \
                      &cudnn_dso_handle);                           \
+      EnforceCUDNNLoaded(#__name);                                 \
       void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
       return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
     }                                                              \
diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc
index 6feba42c0d..7a82d06a0a 100644
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
@@ -78,12 +78,11 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
     *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
     if (nullptr == *dso_handle) {
       if (dso_path == "libcudnn.dylib") {
-        PADDLE_ENFORCE(true,
-                       "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
-                       "For instance, sudo tar -xzf "
-                       "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
-                       "chmod a+r /usr/local/cuda/include/cudnn.h "
-                       "/usr/local/cuda/lib/libcudnn*");
+        LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
+                        "For instance, sudo tar -xzf "
+                        "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+                        "chmod a+r /usr/local/cuda/include/cudnn.h "
+                        "/usr/local/cuda/lib/libcudnn*";
       }
     }
   }
@@ -92,7 +91,8 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
 
 static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
                                               const std::string& dso_name,
-                                              void** dso_handle) {
+                                              void** dso_handle,
+                                              bool throw_on_error = true) {
   int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
   *dso_handle = nullptr;
 
@@ -111,15 +111,19 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
       GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
     }
   }
-  PADDLE_ENFORCE(nullptr != *dso_handle,
-                 "Failed to find dynamic library: %s ( %s ) \n Please specify "
-                 "its path correctly using following ways: \n Method. set "
-                 "environment variable LD_LIBRARY_PATH on Linux or "
-                 "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
-                 "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
-                 "using the DYLD_LIBRARY_PATH is impossible unless System "
-                 "Integrity Protection (SIP) is disabled.",
-                 dlPath, dlerror());
+  auto error_msg =
+      "Failed to find dynamic library: %s ( %s ) \n Please specify "
+      "its path correctly using following ways: \n Method. set "
+      "environment variable LD_LIBRARY_PATH on Linux or "
+      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
+      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
+      "using the DYLD_LIBRARY_PATH is impossible unless System "
+      "Integrity Protection (SIP) is disabled.";
+  if (throw_on_error) {
+    PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror());
+  } else if (nullptr == *dso_handle) {
+    LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
+  }
 }
 
 void GetCublasDsoHandle(void** dso_handle) {
@@ -132,9 +136,10 @@ void GetCublasDsoHandle(void** dso_handle) {
 
 void GetCudnnDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle,
+                             false);
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false);
 #endif
 }
 
diff --git a/paddle/platform/dynload/nccl.cc b/paddle/platform/dynload/nccl.cc
index 8f92b8d94d..91168f37ef 100644
--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/platform/dynload/nccl.cc
@@ -25,6 +25,11 @@ void *nccl_dso_handle;
 
 NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
+void LoadNCCLDSO() {
+  platform::call_once(nccl_dso_flag,
+                      [] { GetNCCLDsoHandle(&nccl_dso_handle); });
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
index 981b2ab258..11007c1031 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -28,18 +28,18 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                         \
-  struct DynLoad__##__name {                                           \
-    template <typename... Args>                                        \
-    auto operator()(Args... args) -> decltype(__name(args...)) {       \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);        \
-      platform::call_once(nccl_dso_flag,                               \
-                          paddle::platform::dynload::GetNCCLDsoHandle, \
-                          &nccl_dso_handle);                           \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);              \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);         \
-    }                                                                  \
-  };                                                                   \
+extern void LoadNCCLDSO();
+
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                   \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);  \
+      paddle::platform::dynload::LoadNCCLDSO();                  \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);        \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
+    }                                                            \
+  };                                                             \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 4fa2eaed31..541eca5f39 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -73,19 +73,20 @@ size_t GpuMaxChunkSize() {
   size_t available = 0;
 
   GpuMemoryUsage(available, total);
-
-  // Reserving the rest memory for page tables, etc.
-  size_t reserving = 0.05 * total;
-
+  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
+           << total / 1024 / 1024 << "M";
+  size_t reserving = static_cast<size_t>(0.05 * total);
   // If available less than minimum chunk size, no usable memory exists.
   available =
-      std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               reserving) -
-      reserving;
+      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
+               total - reserving);
+
+  // Reserving the rest memory for page tables, etc.
 
-  size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total;
+  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
+                                          (total - reserving));
 
-  PADDLE_ENFORCE_LT(allocating, available);
+  PADDLE_ENFORCE_LE(allocating, available);
 
   return allocating;
 }
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index c99dae68be..94ab360a19 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -31,7 +31,7 @@ namespace platform {
 TEST(NCCL, init) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
-  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
   for (int i = 0; i < dev_count; ++i) {
     dynload::ncclCommDestroy(comms[i]);
   }
@@ -62,7 +62,7 @@ TEST(NCCL, all_reduce) {
   std::vector<ncclComm_t> comms;
   comms.resize(dev_count);
   VLOG(1) << "Initializing ncclComm";
-  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
   VLOG(1) << "ncclComm initialized";
   VLOG(1) << "Creating thread data";
   std::vector<std::unique_ptr<PerThreadData<double>>> data;
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
index 856e54df89..25fe8d21b4 100644
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -23,6 +23,7 @@ class PlacePrinter : public boost::static_visitor<> {
  public:
   explicit PlacePrinter(std::ostream &os) : os_(os) {}
   void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
+  void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; }
   void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }
 
  private:
@@ -38,12 +39,17 @@ const Place &get_place() { return the_default_place; }
 
 const GPUPlace default_gpu() { return GPUPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
+const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); }
 
 bool is_gpu_place(const Place &p) {
   return boost::apply_visitor(IsGPUPlace(), p);
 }
 bool is_cpu_place(const Place &p) {
-  return !boost::apply_visitor(IsGPUPlace(), p);
+  return !is_gpu_place(p) && !is_mkldnn_place(p);
+}
+
+bool is_mkldnn_place(const Place &p) {
+  return boost::apply_visitor(IsMKLDNNPlace(), p);
 }
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 5370360a7d..ca98920d41 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -31,6 +31,14 @@ struct CPUPlace {
   inline bool operator!=(const CPUPlace &) const { return false; }
 };
 
+struct MKLDNNPlace {
+  MKLDNNPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const MKLDNNPlace &) const { return true; }
+  inline bool operator!=(const MKLDNNPlace &) const { return false; }
+};
+
 struct GPUPlace {
   GPUPlace() : GPUPlace(0) {}
   explicit GPUPlace(int d) : device(d) {}
@@ -43,16 +51,28 @@ struct GPUPlace {
   int device;
 };
 
+struct CUDNNPlace : public GPUPlace {
+  CUDNNPlace() : GPUPlace() {}
+  explicit CUDNNPlace(int d) : GPUPlace(d) {}
+};
+
 struct IsGPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const MKLDNNPlace &) const { return false; }
   bool operator()(const GPUPlace &gpu) const { return true; }
 };
 
+struct IsMKLDNNPlace : public boost::static_visitor<bool> {
+  bool operator()(const MKLDNNPlace &) const { return true; }
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const GPUPlace &) const { return false; }
+};
+
 // Define the max number of Place in bit length. i.e., the max number of places
 // should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
 #define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
 
-typedef boost::variant<GPUPlace, CPUPlace> Place;
+typedef boost::variant<CUDNNPlace, GPUPlace, CPUPlace, MKLDNNPlace> Place;
 
 // static check number of place types is less equal than
 // 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
@@ -65,9 +85,11 @@ const Place &get_place();
 
 const GPUPlace default_gpu();
 const CPUPlace default_cpu();
+const MKLDNNPlace default_mkldnn();
 
 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
+bool is_mkldnn_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 
 std::ostream &operator<<(std::ostream &, const Place &);
diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc
index 33e2e5a439..c536b59ed8 100644
--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
@@ -5,25 +5,37 @@
 TEST(Place, Equality) {
   paddle::platform::CPUPlace cpu;
   paddle::platform::GPUPlace g0(0), g1(1), gg0(0);
+  paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0);
 
   EXPECT_EQ(cpu, cpu);
   EXPECT_EQ(g0, g0);
   EXPECT_EQ(g1, g1);
   EXPECT_EQ(g0, gg0);
+  EXPECT_EQ(d0, dd0);
 
   EXPECT_NE(g0, g1);
+  EXPECT_NE(d0, d1);
 
   EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
   EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
+
+  EXPECT_TRUE(paddle::platform::is_gpu_place(d0));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, d0));
 }
 
 TEST(Place, Default) {
   EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
   EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
+  EXPECT_TRUE(
+      paddle::platform::is_mkldnn_place(paddle::platform::default_mkldnn()));
 
   paddle::platform::set_place(paddle::platform::CPUPlace());
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
+
+  paddle::platform::set_place(paddle::platform::MKLDNNPlace());
+  EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
+  EXPECT_TRUE(paddle::platform::is_mkldnn_place(paddle::platform::get_place()));
 }
 
 TEST(Place, Print) {
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index bb9d59ec0a..148ebaed3d 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -31,7 +31,7 @@ namespace paddle {
 namespace platform {
 
 // Transform on host or device. It provides the same API in std library.
-template <typename Place>
+template <typename DeviceContext>
 struct Transform {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
   void operator()(const DeviceContext& context, InputIter first, InputIter last,
@@ -45,16 +45,16 @@ struct Transform {
 };
 
 template <>
-struct Transform<platform::CPUPlace> {
+struct Transform<platform::CPUDeviceContext> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const DeviceContext& context, InputIter first, InputIter last,
-                  OutputIter result, UnaryOperation op) {
+  void operator()(const platform::CPUDeviceContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
     std::transform(first, last, result, op);
   }
 
   template <typename InputIter1, typename InputIter2, typename OutputIter,
             typename BinaryOperation>
-  void operator()(const DeviceContext& context, InputIter1 first1,
+  void operator()(const platform::CPUDeviceContext& context, InputIter1 first1,
                   InputIter1 last1, InputIter2 first2, OutputIter result,
                   BinaryOperation op) {
     std::transform(first1, last1, first2, result, op);
@@ -63,27 +63,25 @@ struct Transform<platform::CPUPlace> {
 
 #ifdef __NVCC__
 template <>
-struct Transform<platform::GPUPlace> {
+struct Transform<platform::CUDADeviceContext> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const DeviceContext& context, InputIter first, InputIter last,
-                  OutputIter result, UnaryOperation op) {
+  void operator()(const platform::CUDADeviceContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
-    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
-    thrust::transform(thrust::cuda::par.on(ctx.stream()),
+    thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::DevPtrCast(first), details::DevPtrCast(last),
                       details::DevPtrCast(result), op);
   }
 
   template <typename InputIter1, typename InputIter2, typename OutputIter,
             typename BinaryOperation>
-  void operator()(const DeviceContext& context, InputIter1 first1,
+  void operator()(const platform::CUDADeviceContext& context, InputIter1 first1,
                   InputIter1 last1, InputIter2 first2, OutputIter result,
                   BinaryOperation op) {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
-    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
-    thrust::transform(thrust::cuda::par.on(ctx.stream()),
+    thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::DevPtrCast(first1), details::DevPtrCast(last1),
                       details::DevPtrCast(first2), details::DevPtrCast(result),
                       op);
diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
index c76cab80e4..d36eac8379 100644
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@@ -39,7 +39,7 @@ TEST(Transform, CPUUnary) {
   using namespace paddle::platform;
   CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform<paddle::platform::CPUPlace> trans;
+  Transform<paddle::platform::CPUDeviceContext> trans;
   trans(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
@@ -54,7 +54,7 @@ TEST(Transform, GPUUnary) {
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
-  Transform<paddle::platform::GPUPlace> trans;
+  Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
@@ -68,7 +68,7 @@ TEST(Transform, CPUBinary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform<paddle::platform::CPUPlace> trans;
+  Transform<paddle::platform::CPUDeviceContext> trans;
   CPUDeviceContext ctx;
   trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
@@ -84,7 +84,7 @@ TEST(Transform, GPUBinary) {
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
-  Transform<paddle::platform::GPUPlace> trans;
+  Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index 619897ca19..284b4c42ac 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -14,6 +14,19 @@
 
 #pragma once
 
+#ifdef __CUDACC__
+#ifdef __CUDACC_VER_MAJOR__
+// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
+// __CUDACC_VER__ instead.
+#undef __CUDACC_VER__
+
+#define __CUDACC_VER__                                         \
+  (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
+   __CUDACC_VER_BUILD__)
+#endif
+
+#endif
+
 #include <boost/config.hpp>
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index fd55f410d3..1fb69de90d 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
     SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune
+    DEPS pybind python backward proto_desc paddle_memory executor prune init
     ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
 
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index c16d3e0cbe..4248db34c6 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -16,11 +16,11 @@ limitations under the License. */
 
 #include <mutex>  // for call_once
 #include <unordered_map>
-#include "gflags/gflags.h"
 #include "paddle/framework/backward.h"
 #include "paddle/framework/executor.h"
 #include "paddle/framework/feed_fetch_method.h"
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/init.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
@@ -51,24 +51,6 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
   return generators[prefix].fetch_add(1);
 }
 
-std::once_flag gflags_init_flag;
-
-// TODO(qijun) move init gflags to init.cc
-void InitGflags(std::vector<std::string> &argv) {
-  std::call_once(gflags_init_flag, [&]() {
-    int argc = argv.size();
-    char **arr = new char *[argv.size()];
-    std::string line;
-    for (size_t i = 0; i < argv.size(); i++) {
-      arr[i] = &argv[i][0];
-      line += argv[i];
-      line += ' ';
-    }
-    google::ParseCommandLineFlags(&argc, &arr, true);
-    VLOG(1) << "Init commandline: " << line;
-  });
-}
-
 bool IsCompileGPU() {
 #ifndef PADDLE_WITH_CUDA
   return false;
@@ -282,6 +264,23 @@ All parameter, weight, gradient are variables in Paddle.
     }
     return ret_values;
   });
+  m.def("get_grad_op_descs",
+        [](const OpDescBind &op_desc,
+           const std::unordered_set<std::string> &no_grad_set,
+           std::unordered_map<std::string, std::string> &grad_to_var,
+           const std::vector<BlockDescBind *> &grad_sub_block) {
+          std::vector<std::unique_ptr<OpDescBind>> grad_op_descs =
+              framework::OpInfoMap::Instance()
+                  .Get(op_desc.Type())
+                  .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
+                                 grad_sub_block);
+          std::vector<OpDescBind *> grad_op_desc_ptrs(grad_op_descs.size());
+          std::transform(
+              grad_op_descs.begin(), grad_op_descs.end(),
+              grad_op_desc_ptrs.begin(),
+              [](std::unique_ptr<OpDescBind> &p) { return p.release(); });
+          return grad_op_desc_ptrs;
+        });
   m.def("prune", [](const ProgramDescBind &origin,
                     const std::vector<std::array<size_t, 2>> &targets) {
     ProgramDescBind prog_with_targets(origin);
@@ -421,7 +420,8 @@ All parameter, weight, gradient are variables in Paddle.
       .def("run", &Executor::Run);
 
   m.def("unique_integer", UniqueIntegerGenerator);
-  m.def("init_gflags", InitGflags);
+  m.def("init_gflags", framework::InitGflags);
+  m.def("init_devices", &framework::InitDevices);
 
   m.def("is_compile_gpu", IsCompileGPU);
   m.def("set_feed_variable", framework::SetFeedVariable);
diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh
new file mode 100755
index 0000000000..af16b84ca8
--- /dev/null
+++ b/paddle/scripts/check_env.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+
+if [ "`uname -s`" != "Linux" ]; then
+  echo "Current scenario only support in Linux yet!"
+  exit 0
+fi
+
+echo "========================= Hardware Information ========================="
+sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
+cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
+ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
+physical_cores=$((sockets * cores_per_socket))
+virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
+numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
+echo "CPU Name               : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
+echo "CPU Family             : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
+echo "Socket Number          : $sockets"
+echo "Cores Per Socket       : $cores_per_socket"
+echo "Total Physical Cores   : $physical_cores"
+echo "Total Virtual Cores    : $virtual_cores"
+if [ $ht -eq 1 ]; then
+  echo "Hyper Threading        : OFF"
+  if [ $physical_cores -ne $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+else
+  echo "Hyper Threading        : ON"
+  if [ $physical_cores -ge $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+fi
+echo "NUMA Nodes             : $numa_nodes"
+if [ $numa_nodes -lt $sockets ]; then
+  echo "Warning: NUMA node is not enough for the best performance,\
+ at least $sockets"
+fi
+
+echo "-------------------------- Memory Information --------------------------"
+# dmidecode support start from 2.11
+dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
+if [ $dmi_ver -lt 2 ]; then
+  echo "Error: dmidecode unknown or version is too old"
+  exit 0
+fi
+if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
+  echo "Error: need root to run dmidecode"
+  exit 0
+fi
+max_dimms=0
+num_dimms_installed=0
+for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
+  num_refered=`dmidecode |grep -wc "$dimm_id"`
+  # the actual dimm id should be refered only once
+  if [ $num_refered -eq 1 ]; then
+    num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
+      /Unknown/ {f=1};
+      /Manufacturer/ {if (s==1) {print f; exit 0;}};'`
+    if [ $num_unknown -eq 0 ]; then
+      dimms_installed="$dimms_installed \n $dimm_id"
+      ((num_dimms_installed++))
+    else
+      dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
+    fi
+    ((max_dimms++))
+  fi
+done
+echo "Installed DIMM number  : $num_dimms_installed"
+num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
+if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
+  echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped"
+fi
+num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
+if [ $num_dimms_installed -ne $num_clock_configed ]; then
+  echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed"
+fi
+echo -e "Installed DIMMs Locator: $dimms_installed"
+echo -e "Not installed DIMMs    : $dimms_uninstalled"
+max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
+echo "DIMMs max slots        : $max_dimm_slots"
+if [ $max_dimms -ne $max_dimm_slots ]; then
+  echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
+fi
+free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
+free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
+if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
+  mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+  mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB" 
+  swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
+  total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
+else
+  mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+fi
+echo "Memory Size            : $mem_sz"
+echo "Swap Memory Size       : $swap_sz"
+echo "Total Memory Size      : $total_sz"
+echo "Max Memory Capacity    : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
+# DIMMs fequency
+clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
+echo "Configed Clock Speed   : $clock_speeds"
+num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
+if [ $num_clock_type -ne 1 ]; then
+  echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds"
+fi
+
+echo "-------------------------- Turbo Information  --------------------------"
+scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
+echo "Scaling Driver         : $scaling_drive"
+if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
+  turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
+  if [ $turbo -eq 1 ]; then
+    echo "Turbo Status           : OFF"
+  else
+    echo "Turbo Status           : ON"
+  fi
+else
+  echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS"
+  echo "Turbo Status           : Unknown"
+fi
+# cpu frequency
+num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
+num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
+if [ $num_max_freq -ne 1 ]; then
+  echo "Error: the max_frequency of all CPU should be equal"
+fi
+if [ $num_min_freq -ne 1 ]; then
+  echo "Error: the min_frequency of all CPU should be equal"
+fi
+max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
+max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
+min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
+min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
+echo "CPU Max Frequency      : $max_freq GHz"
+echo "CPU Min Frequency      : $min_freq GHz"
+# cpu governor
+num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
+if [ $num_governor -ne 1 ]; then
+  echo "Error: the governor of all CPU should be the same"
+fi
+governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
+echo "CPU Freq Governor      : $governor"
+
+
+echo "========================= Software Information ========================="
+echo "BIOS Release Date      : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
+echo "OS Version             : `cat /etc/redhat-release`"
+echo "Kernel Release Version : `uname -r`"
+echo "Kernel Patch Version   : `uname -v`"
+echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
+if command -v cmake >/dev/null 2>&1; then 
+  cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
+else
+  cmake_ver=" Not installed"
+fi
+echo "CMake Version          :$cmake_ver"
+echo "------------------ Environment Variables Information -------------------"
+kmp_affinity=`env | grep KMP_AFFINITY`
+omp_dynamic=`env | grep OMP_DYNAMIC`
+omp_nested=`env | grep OMP_NESTED`
+omp_num_threads=`env | grep OMP_NUM_THREADS`
+mkl_num_threads=`env | grep MKL_NUM_THREADS`
+mkl_dynamic=`env | grep MKL_DYNAMIC`
+if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi
+if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi
+if [ ! $omp_nested ]; then omp_nested="unset"; fi
+if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi
+if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi
+if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi
+echo "KMP_AFFINITY           : $kmp_affinity"
+echo "OMP_DYNAMIC            : $omp_dynamic"
+echo "OMP_NESTED             : $omp_nested"
+echo "OMP_NUM_THREADS        : $omp_num_threads"
+echo "MKL_NUM_THREADS        : $mkl_num_threads"
+echo "MKL_DYNAMIC            : $mkl_dynamic"
+# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH
+for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do
+  mkldnn_found=`find $path -name "libmkldnn.so"`
+  if [ "$mkldnn_found" ]; then
+    echo "Found MKL-DNN          : $mkldnn_found"
+  fi
+  mklml_found=`find $path -name "libmklml_intel.so"`
+  if [ "$mklml_found" ]; then
+    echo "Found MKLML            : $mklml_found"
+  fi
+  iomp_found=`find $path -name "libiomp5.so"`
+  if [ "$iomp_found" ]; then
+    echo "Found IOMP             : $iomp_found"
+  fi
+done
+
+# dump all details for fully check
+lscpu > lscpu.dump
+dmidecode > dmidecode.dump
+
+# The expected result would be like:
+# ========================= Hardware Information =========================
+# CPU Name               : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
+# CPU Family             : 6
+# Socket Number          : 2
+# Cores Per Socket       : 20
+# Total Physical Cores   : 40
+# Total Virtual Cores    : 40
+# Hyper Threading        : OFF
+# NUMA Nodes             : 2
+# -------------------------- Memory Information --------------------------
+# Installed DIMM number  : 12
+# Installed DIMMs Locator:
+#  CPU1_DIMM_A1
+#  CPU1_DIMM_B1
+#  CPU1_DIMM_C1
+#  CPU1_DIMM_D1
+#  CPU1_DIMM_E1
+#  CPU1_DIMM_F1
+#  CPU2_DIMM_A1
+#  CPU2_DIMM_B1
+#  CPU2_DIMM_C1
+#  CPU2_DIMM_D1
+#  CPU2_DIMM_E1
+#  CPU2_DIMM_F1
+# Not installed DIMMs    :
+#  CPU1_DIMM_A2
+#  CPU1_DIMM_B2
+#  CPU1_DIMM_C2
+#  CPU1_DIMM_D2
+#  CPU1_DIMM_E2
+#  CPU1_DIMM_F2
+#  CPU2_DIMM_A2
+#  CPU2_DIMM_B2
+#  CPU2_DIMM_C2
+#  CPU2_DIMM_D2
+#  CPU2_DIMM_E2
+#  CPU2_DIMM_F2
+# DIMMs max slots        : 24
+# Memory Size            : 376G
+# Swap Memory Size       : 4.0G
+# Total Memory Size      : 380G
+# Max Memory Capacity    : 2304 GB
+# Configed Clock Speed   : 2666 MHz
+# -------------------------- Turbo Information  --------------------------
+# Scaling Driver         : intel_pstate
+# Turbo Status           : ON
+# CPU Max Frequency      : 3.70 GHz
+# CPU Min Frequency      : 1.00 GHz
+# CPU Freq Governor      : performance
+# ========================= Software Information =========================
+# BIOS Release Date      : 03/10/2017
+# OS Version             : CentOS Linux release 7.3.1611 (Core)
+# Kernel Release Version : 3.10.0-514.el7.x86_64
+# Kernel Patch Version   : #1 SMP Tue Nov 22 16:42:41 UTC 2016
+# GCC Version            : 4.8.5 20150623 (Red Hat 4.8.5-11)
+# CMake Version          : 3.5.2
+# ------------------ Environment Variables Information -------------------
+# KMP_AFFINITY           : unset
+# OMP_DYNAMIC            : unset
+# OMP_NESTED             : unset
+# OMP_NUM_THREADS        : unset
+# MKL_NUM_THREADS        : unset
+# MKL_DYNAMIC            : unset
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
index 1a2d19e823..c2f631bdf4 100644
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
@@ -1,7 +1,7 @@
 # Build this image:  docker build -t mpi .
 #
 
-FROM paddledev/paddle:0.10.0rc3
+FROM paddlepaddle/paddle:0.10.0rc3
 
 ENV DEBIAN_FRONTEND noninteractive
 
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index f3a6f1dba7..f0620498cf 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -20,7 +20,7 @@ binaries.
 
 ## Run The Build
 
-### Build Evironments
+### Build Environments
 
 The pre-built build environment images are:
 
@@ -192,7 +192,7 @@ For developers who are interested in the C++ source code, please use -e "WOBOQ=O
 - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
 
 ```bash
-docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
+docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
 ```
 
 - You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 0f889e6853..e43b9c218a 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -113,7 +113,10 @@ EOF
             -DWITH_SWIG_PY=ON \
             -DWITH_STYLE_CHECK=OFF
         make -j `nproc` gen_proto_py
+        make -j `nproc` paddle_python
         make -j `nproc` paddle_docs paddle_docs_cn
+        make -j `nproc` print_operators_doc
+        paddle/pybind/print_operators_doc > doc/en/html/operators.json
         popd
     fi
 
@@ -175,7 +178,7 @@ EOF
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip && pip install -U pip && \
+        apt-get install -y wget python-pip dmidecode && pip install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
@@ -185,14 +188,6 @@ EOF
     ${DOCKERFILE_GPU_ENV}
     ADD go/cmd/pserver/pserver /usr/bin/
     ADD go/cmd/master/master /usr/bin/
-EOF
-
-    if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then
-        cat >> /paddle/build/Dockerfile <<EOF
-        ADD paddle/pybind/print_operators_doc /usr/bin/
-EOF
-    fi
-    cat >> /paddle/build/Dockerfile <<EOF
     # default command shows the paddle version and exit
     CMD ["paddle", "version"]
 EOF
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index d71cb84df3..43d2d1b410 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -140,7 +140,11 @@ else:
   sys.exit(0)
 EOF
 
-cpu_config
+if [ "`uname -s`" == "Linux" ]; then
+  # only support on linux yet, with mac can use v2
+  cpu_config
+fi
+
 # echo $KMP_AFFINITY $OMP_DYNAMIC
 
 case "$1" in
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
index c6cbbc4eef..f9bc8bf63a 100755
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
@@ -5,4 +5,4 @@ docker run --rm \
        -e "WITH_AVX=ON" \
        -e "WITH_DOC=ON" \
        -e "WOBOQ=ON" \
-       ${1:-"paddledev/paddle:dev"}
+       ${1:-"paddlepaddle/paddle:latest-dev"}
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 7d54f0254c..ff0bac6a07 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -8,7 +8,10 @@ cd $TRAVIS_BUILD_DIR/build
 # Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
+make -j `nproc` paddle_python
 make -j `nproc` paddle_docs paddle_docs_cn
+make -j `nproc` print_operators_doc
+paddle/pybind/print_operators_doc > doc/en/html/operators.json
 
 # check websites for broken links
 # It will be failed now!
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 8f100f02e9..9a7dc0e356 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -20,7 +20,7 @@ DEFINE_bool(use_gpu, false, "Only support CPU training");
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
 #endif
 
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
 // TODO(TJ): change to true when MKLDNN layers support multi-inputs
 DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training");
 #else
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 2fcdbbc8bd..1fbdd5bbd8 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -139,6 +139,8 @@ message PoolConfig {
   optional uint32 output_z = 16 [ default = 1 ];
   optional uint32 img_size_z = 17 [ default = 1 ];
   optional uint32 padding_z = 18 [ default = 1 ];
+
+  optional bool exclude_mode = 19;
 }
 
 message SppConfig {
diff --git a/python/.gitignore b/python/.gitignore
index cc7d0ece4a..1ba1d4c9b0 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -2,6 +2,7 @@
 build
 dist
 paddle.egg-info
+paddlepaddle_gpu.egg-info
 .idea
 paddle/proto/*.py
 paddle/proto/*.pyc
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index c8632295a2..6f589e9169 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -33,6 +33,12 @@ if(WITH_MKLDNN)
   list(APPEND MKL_DEPENDS mkldnn)
 endif()
 
+if(WITH_GPU)
+  SET(PACKAGE_NAME "paddlepaddle-gpu")
+else()
+  SET(PACKAGE_NAME "paddlepaddle")
+endif()
+
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index f662d68263..1030c94e16 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -11,3 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+try:
+    from version import full_version as __version__
+    from version import commit as __git_commit__
+except ImportError:
+    import sys
+    sys.stderr.write('''Warning with import paddle: you should not 
+     import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
+                     )
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5b173694dd..239fe4204b 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1233,7 +1233,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
         'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
@@ -1262,6 +1262,8 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                          pool_conf.padding_y,
                                          pool_conf.stride_y, not ceil_mode)
+    if exclude_mode != None:
+        pool_conf.exclude_mode = exclude_mode
 
 
 def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode):
@@ -2287,11 +2289,17 @@ class Conv3DLayer(Conv3DLayerBase):
 class NormLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
         super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        use_mkldnn = True if use_mkldnn and self.inputs[
+            0].norm.norm_type == 'cmrnorm-projection' else False
+        self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             norm_conf = self.config.inputs[input_index].norm_conf
             parse_norm(self.inputs[input_index].norm, input_layer.name,
                        norm_conf)
+            norm_conf.scale = self.inputs[
+                input_index].norm.scale if use_mkldnn else norm_conf.scale
             self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                                norm_conf.channels, False)
             if norm_conf.norm_type == "cross-channel-norm":
@@ -2303,7 +2311,8 @@ class NormLayer(LayerBase):
 class PoolLayer(LayerBase):
     layer_type = 'pool'
 
-    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+    def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None,
+                 **xargs):
         use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
         if self.layer_type == "mkldnn_pool":
             config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN")
@@ -2314,7 +2323,7 @@ class PoolLayer(LayerBase):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf, ceil_mode)
+                       pool_conf, ceil_mode, exclude_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index f6dc58b9c0..7e118b24a4 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -21,7 +21,7 @@ from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
     ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
 from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
-    CudnnAvgPooling, CudnnMaxPooling
+    CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
 
@@ -1519,34 +1519,33 @@ def lstmemory(input,
     NOTE: This is a low level user interface. You can use network.simple_lstm
     to config a simple plain lstm layer.
 
-    Please refer to **Generating Sequences With Recurrent Neural Networks** for
-    more details about LSTM.
-
-    Link_ goes as below.
-
-    .. _Link: http://arxiv.org/abs/1308.0850
+    Reference:
+        `Generating Sequences With Recurrent Neural Networks
+        <https://arxiv.org/pdf/1308.0850.pdf>`_
 
-    :param name: The lstmemory layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param size: DEPRECATED. size of the lstm cell
+    :param size: DEPRECATED. The dimension of the lstm cell.
     :type size: int
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param reverse: is sequence process reversed or not.
+    :param reverse: Whether the input sequence is processed in a reverse order.
     :type reverse: bool
     :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param gate_act: gate activation type, SigmoidActivation by default.
+    :param gate_act: Activation type of this layer's gates. SigmoidActivation is the
+                     default activation.
     :type gate_act: BaseActivation
-    :param state_act: state activation type, TanhActivation by default.
+    :param state_act: Activation type of the state. TanhActivation is the default activation.
     :type state_act: BaseActivation
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: Parameter Attribute.
-    :type param_attr: ParameterAttribute | None | False
-    :param layer_attr: Extra Layer attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1635,14 +1634,14 @@ def grumemory(input,
         h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}}
 
     NOTE: In PaddlePaddle's implementation, the multiplication operations
-    :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in
-    gate_recurrent layer. Consequently, an additional mixed_layer with
+    :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed
+    in gate_recurrent layer. Consequently, an additional mixed_layer with
     full_matrix_projection or a fc_layer must be included before grumemory
     is called.
 
-    More details can be found by referring to `Empirical Evaluation of Gated
-    Recurrent Neural Networks on Sequence Modeling.
-    <https://arxiv.org/abs/1412.3555>`_
+    Reference:
+        `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
+        <https://arxiv.org/abs/1412.3555>`_
 
     The simple usage is:
 
@@ -1650,28 +1649,29 @@ def grumemory(input,
 
        gru = grumemory(input)
 
-    :param name: The gru layer name.
-    :type name: None | basestring
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput.
-    :param size: DEPRECATED. size of the gru cell
+    :param size: DEPRECATED. The dimension of the gru cell.
     :type size: int
-    :param reverse: Whether sequence process is reversed or not.
+    :param reverse: Whether the input sequence is processed in a reverse order.
     :type reverse: bool
     :param act: Activation type, TanhActivation is the default. This activation
                 affects the :math:`{\\tilde{h_t}}`.
     :type act: BaseActivation
-    :param gate_act: gate activation type, SigmoidActivation by default.
-                     This activation affects the :math:`z_t` and :math:`r_t`. It is the
-                     :math:`\\sigma` in the above formula.
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
+                     the default activation. This activation affects the :math:`z_t`
+                     and :math:`r_t`. It is the :math:`\\sigma` in the above formula.
     :type gate_act: BaseActivation
     :param bias_attr: The bias attribute. If the parameter is set to False or an object
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: Parameter Attribute.
-    :type param_attr: ParameterAttribute | None | False
-    :param layer_attr: Extra Layer attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1715,10 +1715,10 @@ def last_seq(input,
     """
     Get Last Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride,
-    and return the last value of the window as the output. Thus, a long sequence
-    will be shorten. Note that for sequence with sub-sequence, the default value
-    of stride is -1.
+    If stride > 0, this layer will slide a window whose size is determined by stride,
+    and return the last value of the sequence in the window as the output. Thus, a
+    long sequence will be shortened. Note that for sequence with sub-sequence, the
+    default value of stride is -1.
 
     The simple usage is:
 
@@ -1727,14 +1727,16 @@ def last_seq(input,
        seq = last_seq(input=layer)
 
     :param agg_level: Aggregated level
+    :type agg_level: AggregateLevel
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
     :param stride: The step size between successive pooling regions.
-    :type stride: Int
-    :param layer_attr: extra layer attributes.
-    :type layer_attr: ExtraLayerAttribute.
+    :type stride: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1771,10 +1773,10 @@ def first_seq(input,
     """
     Get First Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride,
-    and return the first value of the window as the output. Thus, a long sequence
-    will be shorten. Note that for sequence with sub-sequence, the default value
-    of stride is -1.
+    If stride > 0, this layer will slide a window whose size is determined by stride,
+    and return the first value of the sequence in the window as the output. Thus, a
+    long sequence will be shortened. Note that for sequence with sub-sequence, the
+    default value of stride is -1.
 
     The simple usage is:
 
@@ -1783,13 +1785,15 @@ def first_seq(input,
        seq = first_seq(input=layer)
 
     :param agg_level: aggregation level
+    :type agg_level: AggregateLevel
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
     :param stride: The step size between successive pooling regions.
-    :type stride: Int
-    :param layer_attr: extra layer attributes.
+    :type stride: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1847,8 +1851,8 @@ def expand_layer(input,
                  expand_level=ExpandLevel.FROM_NO_SEQUENCE,
                  layer_attr=None):
     """
-    A layer for "Expand Dense data or (sequence data where the length of each
-    sequence is one) to sequence data."
+    A layer for expanding dense data or (sequence data where the length of each
+    sequence is one) to sequence data.
 
     The example usage is:
 
@@ -1860,7 +1864,9 @@ def expand_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param expand_as: Expand as this layer's sequence info.
+    :param expand_as: Expand the input according to this layer's sequence infomation. And
+                      after the operation, the input expanded will have the same number of
+                      elememts as this layer.
     :type expand_as: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -1868,9 +1874,10 @@ def expand_layer(input,
                       whose type is not ParameterAttribute, no bias is defined. If the
                       parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param expand_level: whether input layer is timestep(default) or sequence.
+    :param expand_level: Whether the input layer is a sequence or the element of a sequence.
     :type expand_level: ExpandLevel
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2709,7 +2716,8 @@ def img_pool_layer(input,
                    pool_size_y=None,
                    stride_y=None,
                    padding_y=None,
-                   ceil_mode=True):
+                   ceil_mode=True,
+                   exclude_mode=None):
     """
     Image pooling Layer.
 
@@ -2721,15 +2729,17 @@ def img_pool_layer(input,
 
     ..  math::
 
-        w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+        w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
+
+        h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
 
     - ceil_mode=False:
 
     ..  math::
 
-        w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+        w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
+
+        h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
 
     The example usage is:
 
@@ -2773,10 +2783,15 @@ def img_pool_layer(input,
     :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
                        details.
     :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use the ceil function to calculate output height and width.
+    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
                       True is the default. If it is set to False, the floor function will
                       be used.
     :type ceil_mode: bool
+    :param exclude_mode: Whether to exclude the padding cells when calculating, but only 
+                         work when pool_type is AvgPooling. If None, also exclude the padding 
+                         cells. If use cudnn, use CudnnAvgPooling or CudnnAvgInclPadPooling 
+                         as pool_type to identify the mode.
+    :type exclude_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2790,7 +2805,7 @@ def img_pool_layer(input,
         pool_type.name = 'avg'
 
     assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
-                               CudnnMaxPooling], \
+                               CudnnMaxPooling, CudnnAvgInclPadPooling], \
         "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
 
     type_name = pool_type.name + '-projection' \
@@ -2819,6 +2834,7 @@ def img_pool_layer(input,
                     padding_y=padding_y))
         ],
         ceil_mode=ceil_mode,
+        exclude_mode=exclude_mode,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -2856,17 +2872,21 @@ def img_pool3d_layer(input,
 
     ..  math::
 
-        w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
-        d = 1 + int(ceil(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z))
+        w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
+
+        h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
+
+        d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
 
     - ceil_mode=False:
 
     ..  math::
 
-        w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
-        d = 1 + int(floor(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z))
+        w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
+
+        h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
+
+        d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
 
     The example usage is:
 
@@ -2989,7 +3009,7 @@ def spp_layer(input,
 
     Reference:
         `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
-        https://arxiv.org/abs/1406.4729`_
+        <https://arxiv.org/abs/1406.4729>`_
 
     The example usage is:
 
@@ -3091,7 +3111,7 @@ def img_cmrnorm_layer(input,
 
     Reference:
         `ImageNet Classification with Deep Convolutional Neural Networks
-        http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_
+        <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
 
     The example usage is:
 
@@ -3159,7 +3179,7 @@ def batch_norm_layer(input,
     Reference:
         `Batch Normalization: Accelerating Deep Network Training by Reducing
         Internal Covariate Shift
-        http://arxiv.org/abs/1502.03167`_
+        <http://arxiv.org/abs/1502.03167>`_
 
     The example usage is:
 
@@ -3297,7 +3317,7 @@ def row_l2_norm_layer(input, name=None, layer_attr=None):
     A layer for L2-normalization in each row.
 
     .. math::
-       out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+       out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}}
 
     where the size of :math:`in` is (batchSize x dataDim) ,
     and the size of :math:`out` is a (batchSize x dataDim) .
@@ -5417,17 +5437,27 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
 
     Reference:
         `Maxout Networks
-        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_
+        <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
         `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
-        https://arxiv.org/pdf/1312.6082v4.pdf`_
+        <https://arxiv.org/pdf/1312.6082v4.pdf>`_
+
 
     .. math::
-       y_{si+j} = \max_k x_{gsi + sk + j}
-       g = groups
-       s = input.size / num_channels
-       0 \le i < num_channels / groups
-       0 \le j < s
-       0 \le k < groups
+
+       & out = \max_k (in[n, k, o_c , s])
+
+       & out_{i * s + j} = \max_k in_{  k * o_{c} * s + i * s + j}
+
+       & s = \\frac{input.size}{ num\_channels}
+
+       & o_{c} = \\frac{num\_channels}{groups}
+
+       & 0 \le i < o_{c}
+
+       & 0 \le j < s
+
+       & 0 \le k < groups
+
 
     The simple usage is:
 
@@ -5486,7 +5516,7 @@ def ctc_layer(input,
     Reference:
         `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
         Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
@@ -5560,7 +5590,7 @@ def warp_ctc_layer(input,
     Reference:
         `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
         with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
         - Let num_classes represents the category number. Considering the 'blank'
@@ -5781,7 +5811,7 @@ def nce_layer(input,
 
     Reference:
         `A fast and simple algorithm for training neural probabilistic language
-        models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_
+        models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
 
     The example usage is:
 
@@ -5897,7 +5927,7 @@ def rank_cost(left,
 
     Reference:
         `Learning to Rank using Gradient Descent
-        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_
+        <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
 
     .. math::
 
@@ -6164,9 +6194,11 @@ def huber_regression_cost(input,
     Given a prediction f(x), a label y and :math:`\delta`, the loss function
     is defined as:
 
-    .. math:
-       loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta
-       loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise
+    .. math::
+
+       loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta
+
+       loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise
 
     The example usage is:
 
@@ -6213,12 +6245,14 @@ def huber_classification_cost(input,
     """
     For classification purposes, a variant of the Huber loss called modified Huber
     is sometimes used. Given a prediction f(x) (a real-valued classifier score) and
-    a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber
+    a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber
     loss is defined as:
 
     .. math:
-       loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1
-       loss = -4yf(x), \text{otherwise}
+
+       loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1
+
+       loss = -4yf(x), otherwise
 
     The example usage is:
 
@@ -6433,7 +6467,7 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
 
     Reference:
         `Fast R-CNN
-        https://arxiv.org/pdf/1504.08083v2.pdf`_
+        <https://arxiv.org/pdf/1504.08083v2.pdf>`_
 
     The example usage is:
 
@@ -6640,7 +6674,7 @@ def prelu_layer(input,
 
     Reference:
         `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_
+        ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
 
     .. math::
        z_i &\\quad if \\quad z_i > 0 \\\\
@@ -6737,7 +6771,7 @@ def gated_unit_layer(input,
 
     Reference:
         `Language Modeling with Gated Convolutional Networks
-        https://arxiv.org/abs/1612.08083`_
+        <https://arxiv.org/abs/1612.08083>`_
 
     .. math::
        y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
@@ -6963,7 +6997,7 @@ def clip_layer(input, min, max, name=None):
 
     .. math::
 
-        out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
+        out[i] = \min (\max (in[i],p_{1} ),p_{2} )
 
     .. code-block:: python
 
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 9776ae1805..8bfe56d795 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1119,8 +1119,9 @@ def simple_gru2(input,
     :param gru_bias_attr: bias parameter attribute of gru layer, 
                           False means no bias, None means default bias.
     :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
+    :param gru_param_attr: param parameter attribute of gru layer,
+                          None means default param.
+    :type gru_param_attr: ParameterAttribute|None
     :return: the gru group.
     :rtype: LayerOutput
     """
diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py
index f45616551b..e0aeb311b3 100644
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -16,7 +16,8 @@
 
 __all__ = [
     "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
-    "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
+    "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling",
+    "SumPooling", "SquareRootNPooling"
 ]
 
 
@@ -88,6 +89,16 @@ class CudnnAvgPooling(BasePoolingType):
         BasePoolingType.__init__(self, "cudnn-avg-pool")
 
 
+class CudnnAvgInclPadPooling(BasePoolingType):
+    """
+    Cudnn average pooling only support GPU. Return the average value in the
+    pooling window taking into account the padding cells.
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool")
+
+
 class AvgPooling(BasePoolingType):
     """
     Average pooling.
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index e31e501ce9..191d9ecfb1 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -71,7 +71,7 @@ def download(url, module_name, md5sum):
         if retry < retry_limit:
             retry += 1
         else:
-            raise RuntimeError("Cannot download {0} within retry limit {2}".
+            raise RuntimeError("Cannot download {0} within retry limit {1}".
                                format(url, retry_limit))
         print "Cache file %s not found, downloading %s" % (filename, url)
         r = requests.get(url, stream=True)
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
index 137c573622..2d23ff0a16 100644
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -4,7 +4,7 @@ import layers
 from framework import Program, unique_name, Variable
 from layer_helper import LayerHelper
 
-__all__ = ['Accuracy']
+__all__ = ['Accuracy', 'ChunkEvaluator']
 
 
 def _clone_var_(block, var):
@@ -132,3 +132,74 @@ class Accuracy(Evaluator):
         correct = layers.cast(correct, dtype='float32', **kwargs)
         out = layers.elementwise_div(x=correct, y=total, **kwargs)
         return np.array(executor.run(eval_program, fetch_list=[out])[0])
+
+
+class ChunkEvaluator(Evaluator):
+    """
+    Accumulate counter numbers output by chunk_eval from mini-batches and 
+    compute the precision recall and F1-score using the accumulated counter 
+    numbers.
+    """
+
+    def __init__(self,
+                 input,
+                 label,
+                 chunk_scheme,
+                 num_chunk_types,
+                 excluded_chunk_types=None,
+                 **kwargs):
+        super(ChunkEvaluator, self).__init__("chunk_eval", **kwargs)
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.num_infer_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_infer_chunks')
+        self.num_label_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_label_chunks')
+        self.num_correct_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_correct_chunks')
+        kwargs = {'main_program': main_program}
+        precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
+            input=input,
+            label=label,
+            chunk_scheme=chunk_scheme,
+            num_chunk_types=num_chunk_types,
+            excluded_chunk_types=excluded_chunk_types,
+            **kwargs)
+        layers.sums(
+            input=[self.num_infer_chunks, num_infer_chunks],
+            out=self.num_infer_chunks,
+            **kwargs)
+        layers.sums(
+            input=[self.num_label_chunks, num_label_chunks],
+            out=self.num_label_chunks,
+            **kwargs)
+        layers.sums(
+            input=[self.num_correct_chunks, num_correct_chunks],
+            out=self.num_correct_chunks,
+            **kwargs)
+
+        self.metrics.extend([precision, recall, f1_score])
+
+    def eval(self, executor, eval_program=None):
+        if eval_program is None:
+            eval_program = Program()
+        block = eval_program.current_block()
+        kwargs = {'main_program': eval_program}
+        num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run(
+            eval_program,
+            fetch_list=[_clone_var_(block, state) for state in self.states])
+        num_infer_chunks = num_infer_chunks[0]
+        num_label_chunks = num_label_chunks[0]
+        num_correct_chunks = num_correct_chunks[0]
+        precision = float(
+            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
+        recall = float(
+            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if num_correct_chunks else 0
+        return np.array(
+            [precision], dtype='float32'), np.array(
+                [recall], dtype='float32'), np.array(
+                    [f1_score], dtype='float32')
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index bdc82eede9..9a99b045dc 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -46,6 +46,13 @@ class Executor(object):
             p.set_place(each)
             act_places.append(p)
 
+        # TODO(dzhwinter) : consider that our fluid tests all written in 
+        # GPUPlace(gpu_id), this will be changed in next PR.
+        if core.is_compile_gpu():
+            core.init_devices(["CPU", "GPU:0"])
+        else:
+            core.init_devices(["CPU"])
+
         self.executor = core.Executor(act_places)
         self.places = places
 
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index e6e3190b99..bf0cd275b6 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -3,6 +3,7 @@ import collections
 import numpy as np
 from . import core
 import proto.framework_pb2 as framework_pb2
+import google.protobuf.message
 import contextlib
 
 __all__ = [
@@ -13,11 +14,28 @@ __all__ = [
 
 
 def unique_name(prefix):
+    """
+    Generate unique names with prefix
+
+    Args:
+        prefix(str): The prefix of return string
+
+    Returns(str): A unique string with the prefix
+
+    """
     uid = core.unique_integer(prefix)  # unique during whole process.
     return "_".join([prefix, str(uid)])
 
 
 def convert_np_dtype_to_dtype_(np_dtype):
+    """
+    Convert the data type in numpy to the data type in Paddle
+    Args:
+        np_dtype(np.dtype): the data type in numpy
+
+    Returns(core.DataType): the data type in Paddle
+
+    """
     dtype = np.dtype(np_dtype)
     if dtype == np.float32:
         return core.DataType.FP32
@@ -38,17 +56,33 @@ def convert_np_dtype_to_dtype_(np_dtype):
 
 
 def dtype_is_floating(dtype):
+    """
+    Check the data type is floating or not.
+    Args:
+        dtype(np.dtype|core.DataType): data type.
+            Could be numpy format or Paddle format
+
+    Returns(bool): True if data type is a float value
+
+    """
     if not isinstance(dtype, core.DataType):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
-    if (dtype == core.DataType.FP16 or dtype == core.DataType.FP32 or
-            dtype == core.DataType.FP64):
-        return True
-    else:
-        return False
+    return dtype in [core.DataType.FP16, core.DataType.FP32, core.DataType.FP64]
 
 
 def _debug_string_(proto, throw_on_error=True):
+    """
+    Get the debug string of a protobuf message. The message could be not
+    initialized.
+    Args:
+        proto(google.protobuf.message.Message): The protobuf message
+        throw_on_error(bool): True if raise an error when the protobuf message
+            is not initialized.
+
+    Returns(str): The debug string of the protobuf message
+
+    """
     error_fields = list()
     if not proto.IsInitialized(error_fields) and throw_on_error:
         raise ValueError("{0} are not initialized\nThe message is {1}".format(
@@ -57,6 +91,38 @@ def _debug_string_(proto, throw_on_error=True):
 
 
 class Variable(object):
+    """
+    Python variable. Every input and output of an operator is a variable. Every
+    variable belongs to a block. The variable has a name and two variables in
+    different blocks could have the same name.
+
+    There are many kinds of variables. Please reference the framework.proto for
+    details.
+
+    Notes: The constructor of Variable should not be invoked directly. Please
+    use `Block.create_var` to create a variable.
+
+    >>> cur_program = Program()
+    >>> cur_block = cur_program.current_block()
+    >>> new_variable = cur_block.create_var(
+    >>>                    name="X", shape=[-1, 23, 48], dtype='float32')
+
+    Args:
+        block(Block): The associated block. It will be passed by
+            `Block.create_var` automatically.
+        type(core.VarDesc.VarType): Variable type. Please reference the
+            framework.proto for details.
+        shape(tuple|list|None): The shape of variable. -1 means the batch size.
+            Some kinds of variable do not contain shape, just set it to None.
+        dtype(np.dtype|core.DataType|str): The data type of variable.
+        lod_level(int): The level of lod tensor. 0 means there is not a time
+            series data.
+        persistable(bool): True if the variable should be saved as check point.
+            Defaults to False.
+        stop_gradient(bool): True if the variable will stop to calculate
+            gradients when backward. Defaults to False.
+    """
+
     def __init__(self,
                  block,
                  type=core.VarDesc.VarType.LOD_TENSOR,
@@ -140,6 +206,16 @@ class Variable(object):
         return self.to_string(True)
 
     def to_string(self, throw_on_error):
+        """
+        Get debug string.
+
+        Args:
+            throw_on_error(bool): True if raise an exception when self is not
+                intialized.
+
+        Returns(str): The debug string.
+
+        """
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.VarDesc.FromString(str(protostr))
         return _debug_string_(proto, throw_on_error)
@@ -185,7 +261,9 @@ class Variable(object):
 def get_all_op_protos():
     """
     Get all registered op proto from PaddlePaddle C++ end.
-    :return: A list of registered OpProto.
+
+    Returns(list): list of OpProto
+
     """
     protostrs = core.get_all_op_protos()
     ret_values = []
@@ -196,6 +274,10 @@ def get_all_op_protos():
 
 
 class OpProtoHolder(object):
+    """
+    A global variable to hold all OpProtos from C++ as a map
+    """
+
     @classmethod
     def instance(cls):
         if not hasattr(cls, '_instance'):
@@ -212,12 +294,26 @@ class OpProtoHolder(object):
             self.op_proto_map[proto.type] = proto
 
     def get_op_proto(self, type):
+        """
+        Get OpProto by a type string.
+        Args:
+            type(str): The type that operator registered in C++ side.
+
+        Returns(framework_pb2.OpProto): The OpProto
+
+        """
         if type not in self.op_proto_map:
             raise ValueError("Operator \"%s\" has not been registered." % type)
         return self.op_proto_map[type]
 
 
 class Operator(object):
+    """
+    Python Operator class. The operator represents the build in instructs in a
+    Block. Users can use the build in instructs to describe their neural
+    network.
+    """
+
     def __init__(self,
                  block,
                  desc,
@@ -225,6 +321,30 @@ class Operator(object):
                  inputs=None,
                  outputs=None,
                  attrs=None):
+        """
+        Constructor.
+
+        Notes: The constructor of operator should not be invoked directly. Use
+        Block.append_op or Block.prepend_op instead.
+
+        >>> cur_program = Program()
+        >>> cur_block = cur_program.current_block()
+        >>> # var1 += var2 + var3
+        >>> cur_block.append_op(type="sum",
+        >>>                     inputs={"X": [var1, var2, var3]},
+        >>>                     outputs={"Out": [var1]})
+
+        Args:
+            block(Block): The block has the current operator
+            desc(core.OpDesc): The protobuf description
+            type(str): The type of operator.
+            inputs(dict): The input dictionary. Key is the input parameter name.
+                Value is a list of variables.
+            outputs(dict): The output dictionary. Has same format with inputs
+            attrs(dict): The attributes dictionary. Key is attribute name. Value
+                is the attribute value. The attribute type should be as same as
+                the type registered in C++
+        """
         self.block = block
         self.desc = desc
         if len(self.desc.type()) != 0:
@@ -311,6 +431,15 @@ class Operator(object):
             self.desc.infer_shape(self.block.desc)
 
     def to_string(self, throw_on_error):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+
+        Returns(str): The debug string.
+
+        """
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.OpDesc.FromString(str(protostr))
         return _debug_string_(proto, throw_on_error)
@@ -325,21 +454,55 @@ class Operator(object):
         return self.desc.type()
 
     def input(self, name):
+        """
+        Get input arguments by the input parameter name
+        Args:
+            name(str): The input parameter name
+
+        Returns(list): return the list of argument names associated with the
+            specific parameter name.
+
+        """
         return self.desc.input(name)
 
     @property
     def input_names(self):
+        """
+        Get all input parameter names
+        Returns(list): return a list of input parameter names
+
+        """
         return self.desc.input_names()
 
     def output(self, name):
+        """
+        Get output arguments by the output parameter name
+        Args:
+            name(str): The output parameter name
+
+        Returns(list): return the list of argument names associated with the
+            specific parameter name.
+
+        """
         return self.desc.output(name)
 
     @property
     def output_names(self):
+        """
+        Get all output parameter names
+        Returns(list): return a list of output parameter names
+
+        """
         return self.desc.output_names()
 
     @property
     def idx(self):
+        """
+        Return the array index of current operator.
+        Returns(int): The array index in block.ops array
+        Raises:
+            ValueError: when the operator is not found.
+        """
         for i, op in enumerate(self.block.ops):
             if op == self:
                 return i
@@ -347,19 +510,57 @@ class Operator(object):
             "Can't find op itself in it's block. It could be a bug of Paddle.")
 
     def has_attr(self, name):
+        """
+        operator has the attribute with name or not.
+        Args:
+            name(str): the attribute name
+
+        Returns(bool): True if has this attribute.
+
+        """
         return self.desc.has_attr(name)
 
     def attr_type(self, name):
+        """
+        Get the type of attribute by attribute name
+        Args:
+            name(str): the attribute name
+
+        Returns(core.AttrType): the attribute type
+
+        """
         return self.desc.attr_type(name)
 
     @property
     def attr_names(self):
+        """
+        Get all attribute names
+        Returns(list): The list of attribute name
+
+        """
         return self.desc.attr_names()
 
     def attr(self, name):
+        """
+        Get attribute by name
+        Args:
+            name(str): the attribute name
+
+        Returns(bool|int|str|float|list): The attribute value. The return value
+            can be any valid attribute type.
+
+        """
         return self.desc.attr(name)
 
     def block_attr(self, name):
+        """
+        Get the block attribute by name
+        Args:
+            name(str): the attribute name
+
+        Returns(int): the block index
+
+        """
         return self.desc.block_attr(name)
 
 
@@ -479,7 +680,7 @@ class Block(object):
         """
         Copy the information of parameters from other block
         Args:
-            other(Block): other block 
+            other(Block): other block
 
         Returns:
             None
@@ -623,7 +824,7 @@ class Program(object):
 
     def copy_param_info_from(self, other):
         """
-        Copy the information of parameters from other program. 
+        Copy the information of parameters from other program.
         Args:
             other(Program): Other program
 
@@ -675,7 +876,7 @@ def default_startup_program():
     """
     Get default startup program. In startup program, Paddle will initialize
     parameters, initialize nccl handle, etc.
-    
+
     Returns:
         Program: startup program
     """
@@ -685,7 +886,7 @@ def default_startup_program():
 def default_main_program():
     """
     Get default main program. The main program is used for training or testing.
-    
+
     Returns:
         Program: main program
     """
@@ -695,7 +896,7 @@ def default_main_program():
 def switch_main_program(program):
     """
     Switch the main program to a new program.
-    
+
     Args:
         program(Program): The new main program
 
@@ -710,7 +911,7 @@ def switch_main_program(program):
 
 def switch_startup_program(program):
     """
-    Switch the startup program to a new program 
+    Switch the startup program to a new program
     Args:
         program(Program): The new startup program
 
@@ -727,15 +928,15 @@ def switch_startup_program(program):
 def program_guard(main_program, startup_program=None):
     """
     Switch program with `with` statement
-    
+
     Examples:
         >>> with program_guard(Program()):
         >>>   data = fluid.layers.data(...)
         >>>   hidden = fluid.layers.fc(...)
-        
+
     Args:
         main_program(Program): New main program inside `with` statement
-        startup_program(Program): New startup program inside `with` statement. 
+        startup_program(Program): New startup program inside `with` statement.
             None means do not change startup program.
 
     Returns:
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
deleted file mode 100644
index 98a04ea9c2..0000000000
--- a/python/paddle/v2/fluid/layers.py
+++ /dev/null
@@ -1,2103 +0,0 @@
-import core
-import proto.framework_pb2 as framework_pb2
-from framework import OpProtoHolder, Variable, Program, Operator
-from initializer import Constant, Normal, Xavier, Initializer
-from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
-import re
-import cStringIO
-from param_attr import ParamAttr
-import contextlib
-
-__all__ = [
-    'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
-    'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
-    'batch_norm', 'accuracy', 'split_lod_tensor', 'While'
-]
-
-
-def fc(input,
-       size,
-       num_flatten_dims=1,
-       param_attr=None,
-       bias_attr=None,
-       act=None,
-       name=None,
-       main_program=None,
-       startup_program=None):
-    """
-    Fully Connected Layer.
-
-    Args:
-       input: The input tensor to the function
-       size: The size of the layer
-       num_flatten_dims: Number of columns in input
-       param_attr: The parameters/weights to the FC Layer
-       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
-       bias_attr: The bias parameter for the FC layer
-       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
-       act: Activation to be applied to the output of FC layer
-       name: Name/alias of the function
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-
-    This function can take in multiple inputs and performs the Fully Connected
-    function (linear transformation) on top of each of them.
-    So for input x, the output will be : Wx + b. Where W is the parameter,
-    b the bias and x is the input.
-
-    The function also applies an activation (non-linearity) on top of the
-    output, if activation is passed in the input.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
-
-    """
-    helper = LayerHelper('fc', **locals())
-
-    dtype = helper.input_dtype()
-
-    mul_results = []
-    for input_var, param_attr in helper.iter_inputs_and_params():
-        input_shape = input_var.shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
-        ] + [size]
-        w = helper.create_parameter(
-            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
-        tmp = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="mul",
-            inputs={
-                "X": input_var,
-                "Y": w,
-            },
-            outputs={"Out": tmp},
-            attrs={'x_num_col_dims': num_flatten_dims,
-                   'y_num_col_dims': 1})
-        mul_results.append(tmp)
-
-    # sum
-    if len(mul_results) == 1:
-        pre_bias = mul_results[0]
-    else:
-        pre_bias = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
-    # add bias
-    pre_activation = helper.append_bias_op(pre_bias)
-    # add activation
-    return helper.append_activation(pre_activation)
-
-
-def embedding(input,
-              size,
-              is_sparse=False,
-              param_attr=None,
-              dtype='float32',
-              main_program=None,
-              startup_program=None):
-    """
-    Embedding Layer.
-
-    Args:
-       param_initializer:
-       input: The input to the function
-       size: The size of the layer
-       is_sparse: A flag that decleares whether the input is sparse
-       param_attr: Parameters for this layer
-       dtype: The type of data : float32, float_16, int etc
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-
-    This function can take in the input (which is a vector of IDs) and
-    performs a lookup in the lookup_table using these IDs, to result into
-    the embedding of each ID in the input.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
-
-    """
-
-    helper = LayerHelper('embedding', **locals())
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
-    tmp = helper.create_tmp_variable(dtype)
-    helper.append_op(
-        type='lookup_table',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse})
-    return tmp
-
-
-# TODO(qijun): expose H0 and C0
-def dynamic_lstm(input,
-                 size,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_peepholes=True,
-                 is_reverse=False,
-                 gate_activation='sigmoid',
-                 cell_activation='tanh',
-                 candidate_activation='tanh',
-                 dtype='float32',
-                 main_program=None,
-                 startup_program=None):
-    helper = LayerHelper('lstm', **locals())
-    size = size / 4
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
-    bias_size = [1, 7 * size]
-    if not use_peepholes:
-        bias_size[1] = 4 * size
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    hidden = helper.create_tmp_variable(dtype)
-    cell = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_cell_pre_act = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='lstm',
-        inputs={'Input': input,
-                'Weight': weight,
-                'Bias': bias},
-        outputs={
-            'Hidden': hidden,
-            'Cell': cell,
-            'BatchGate': batch_gate,
-            'BatchCellPreAct': batch_cell_pre_act
-        },
-        attrs={
-            'use_peepholes': use_peepholes,
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'cell_activation': cell_activation,
-            'candidate_activation': candidate_activation
-        })
-    return hidden, cell
-
-
-def data(name,
-         shape,
-         append_batch_size=True,
-         dtype='float32',
-         lod_level=0,
-         type=core.VarDesc.VarType.LOD_TENSOR,
-         main_program=None,
-         startup_program=None,
-         stop_gradient=True):
-    """
-    Data Layer.
-
-    Args:
-       name: The name/alias of the function
-       shape: Tuple declaring the shape.
-       append_batch_size: Whether or not to append the data as a batch.
-       dtype: The type of data : float32, float_16, int etc
-       type: The output type. By default it is LOD_TENSOR.
-       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-       stop_gradient: A boolean that mentions whether gradient should flow.
-
-    This function takes in input and based on whether data has
-    to be returned back as a minibatch, it creates the global variable using
-    the helper functions. The global variables can be accessed by all the
-    following operations and layers in the graph.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
-
-    """
-    helper = LayerHelper('data', **locals())
-    shape = list(shape)
-    for i in xrange(len(shape)):
-        if shape[i] is None:
-            shape[i] = -1
-            append_batch_size = False
-        elif shape[i] < 0:
-            append_batch_size = False
-
-    if append_batch_size:
-        shape = [-1] + shape  # append batch size as -1
-
-    return helper.create_global_variable(
-        name=name,
-        shape=shape,
-        dtype=dtype,
-        type=type,
-        stop_gradient=stop_gradient,
-        lod_level=lod_level)
-
-
-def create_tensor(dtype, name=None, main_program=None, startup_program=None):
-    helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(name=helper.name, dtype=dtype)
-
-
-def _convert_(name):
-    """
-    Formatting.
-
-    Args:
-       name: The name/alias
-
-    This function takes in a name and converts it to a standard format of
-    group1_group2. Where as per the regular expression, group1 can have
-    alphabets and numbers and group2 has capital alphabets.
-
-    """
-    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
-    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
-
-
-def _generate_doc_string_(op_proto):
-    """
-    Generate docstring by OpProto
-
-    Args:
-        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
-
-    Returns:
-        str: the document string
-    """
-
-    def _type_to_str_(tp):
-        return framework_pb2.AttrType.Name(tp)
-
-    if not isinstance(op_proto, framework_pb2.OpProto):
-        raise TypeError("OpProto should be `framework_pb2.OpProto`")
-
-    buf = cStringIO.StringIO()
-    buf.write(op_proto.comment)
-    buf.write('\nArgs:\n')
-    for each_input in op_proto.inputs:
-        line_begin = '    {0}: '.format(_convert_(each_input.name))
-        buf.write(line_begin)
-        buf.write(each_input.comment)
-        buf.write('\n')
-        buf.write(' ' * len(line_begin))
-        buf.write('Duplicable: ')
-        buf.write(str(each_input.duplicable))
-        buf.write('  Optional: ')
-        buf.write(str(each_input.dispensable))
-        buf.write('\n')
-
-    for each_attr in op_proto.attrs:
-        buf.write('    ')
-        buf.write(each_attr.name)
-        buf.write(' (')
-        buf.write(_type_to_str_(each_attr.type))
-        buf.write('): ')
-        buf.write(each_attr.comment)
-        buf.write('\n')
-
-    if len(op_proto.outputs) != 0:
-        buf.write('\nReturns:\n')
-        buf.write('    ')
-        for each_opt in op_proto.outputs:
-            if not each_opt.intermediate:
-                break
-        buf.write(each_opt.comment)
-
-    return buf.getvalue()
-
-
-def _create_op_func_(op_type):
-    """
-    Create an Operator for a Function.
-
-    Args:
-       op_type: The name of the operator to be created
-
-    This function takes in the operator type (sigmoid, mean , average etc) and
-    creates the operator functionality.
-
-    """
-    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
-    not_intermediate_outputs = \
-        filter(lambda output: not output.intermediate, op_proto.outputs)
-    intermediate_outputs = \
-        filter(lambda output: output.intermediate, op_proto.outputs)
-
-    if len(not_intermediate_outputs) != 1:
-        raise ValueError("Only one non intermediate output operator can be",
-                         "automatically generated")
-
-    if not_intermediate_outputs[0].duplicable:
-        raise ValueError(
-            "Only non duplicable op can be automatically generated")
-
-    for output in intermediate_outputs:
-        if output.duplicable:
-            raise ValueError("The op can be automatically generated only when ",
-                             "all intermediate ops are not duplicable")
-
-    o_name = not_intermediate_outputs[0].name
-    intermediate_output_names = [output.name for output in intermediate_outputs]
-
-    def infer_and_check_dtype(op_proto, **kwargs):
-        """
-        This function performs the sanity check for dtype and
-        instance type.
-        """
-        dtype = None
-        for ipt in op_proto.inputs:
-            name = _convert_(ipt.name)
-            val = kwargs.pop(name, [])
-            if not isinstance(val, list) and not isinstance(val, tuple):
-                val = [val]
-            for each in val:
-                if not isinstance(each, Variable):
-                    raise ValueError("input of {0} must be variable".format(
-                        op_type))
-
-                if dtype is None:
-                    dtype = each.dtype
-                elif dtype != each.dtype:
-                    raise ValueError(
-                        "operator {0} must input same dtype".format(op_type))
-
-        return dtype
-
-    def func(**kwargs):
-        helper = LayerHelper(op_type, **kwargs)
-
-        dtype = infer_and_check_dtype(op_proto, **kwargs)
-
-        inputs = dict()
-        for ipt in op_proto.inputs:
-            name = _convert_(ipt.name)
-            val = kwargs.pop(name, [])
-            if not isinstance(val, list) and not isinstance(val, tuple):
-                val = [val]
-            inputs[ipt.name] = val
-
-        outputs = dict()
-        out = helper.create_tmp_variable(dtype=dtype)
-        outputs[o_name] = [out]
-        for name in intermediate_output_names:
-            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
-        return helper.append_activation(out)
-
-    func.__name__ = op_type
-    globals()[op_type] = func
-    func.__doc__ = _generate_doc_string_(op_proto)
-    global __all__
-    __all__.append(op_type)
-
-
-_create_op_func_('mean')
-_create_op_func_('mul')
-_create_op_func_('elementwise_add')
-_create_op_func_('elementwise_div')
-_create_op_func_('dropout')
-_create_op_func_('reshape')
-_create_op_func_('sigmoid')
-_create_op_func_('scale')
-_create_op_func_('reshape')
-_create_op_func_('transpose')
-_create_op_func_('sigmoid_cross_entropy_with_logits')
-
-
-def cast(x, dtype, main_program=None):
-    """
-    This function takes in the input with input_dtype
-    and casts it to the output_dtype as the output.
-    """
-    helper = LayerHelper('cast', **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
-    helper.append_op(
-        type='cast',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'in_dtype': x.dtype,
-               'out_dtype': out.dtype})
-    return out
-
-
-def concat(input, axis, main_program=None, startup_program=None):
-    """
-    This function concats the input along the axis mentioned
-    and returns that as the output.
-    """
-    helper = LayerHelper('concat', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(
-        type='concat',
-        inputs={'X': input},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
-    return out
-
-
-def sums(input, out=None, main_program=None, startup_program=None):
-    """
-    This function takes in the input and performs the sum operation on it
-    and returns that as the output.
-    """
-    helper = LayerHelper('sum', **locals())
-    if out is None:
-        out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
-    return out
-
-
-def linear_chain_crf(input,
-                     label,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
-    helper = LayerHelper('linear_chain_crf', **locals())
-    size = input.shape[1]
-    transition = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[size + 2, size],
-        dtype=helper.input_dtype())
-    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
-    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(
-        type='linear_chain_crf',
-        inputs={"Emission": [input],
-                "Transition": transition,
-                "Label": label},
-        outputs={
-            "Alpha": [alpha],
-            "EmissionExps": [emission_exps],
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood
-        })
-
-    return log_likelihood
-
-
-def crf_decoding(input,
-                 param_attr,
-                 label=None,
-                 main_program=None,
-                 startup_program=None):
-    helper = LayerHelper('crf_decoding', **locals())
-    transition = helper.get_parameter(param_attr.name)
-    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(
-        type='crf_decoding',
-        inputs={"Emission": [input],
-                "Transition": transition,
-                "Label": label},
-        outputs={"ViterbiPath": [viterbi_path]})
-
-    return viterbi_path
-
-
-def assign(input, output, main_program=None, startup_program=None):
-    helper = LayerHelper('assign', **locals())
-    helper.append_op(
-        type='scale',
-        inputs={'X': [input]},
-        outputs={'Out': [output]},
-        attrs={'scale': 1.0})
-    return output
-
-
-def split_lod_tensor(input,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
-    helper = LayerHelper('split_lod_tensor', **locals())
-    out_true = helper.create_tmp_variable(dtype=input.dtype)
-    out_false = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='split_lod_tensor',
-        inputs={
-            'X': input,
-            'Mask': mask,
-        },
-        outputs={'OutTrue': out_true,
-                 'OutFalse': out_false},
-        attrs={'level': level})
-    return out_true, out_false
-
-
-def merge_lod_tensor(in_true,
-                     in_false,
-                     x,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
-    helper = LayerHelper('merge_lod_tensor', **locals())
-    out = helper.create_tmp_variable(dtype=in_true.dtype)
-    helper.append_op(
-        type='merge_lod_tensor',
-        inputs={'X': x,
-                'Mask': mask,
-                'InTrue': in_true,
-                'InFalse': in_false},
-        outputs={'Out': out},
-        attrs={'level': level})
-    return out
-
-
-def cos_sim(X, Y, **kwargs):
-    """
-    This function performs the cosine similarity between two tensors
-    X and Y and returns that as the output.
-    """
-    helper = LayerHelper('cos_sim', **kwargs)
-    out = helper.create_tmp_variable(dtype=X.dtype)
-    xnorm = helper.create_tmp_variable(dtype=X.dtype)
-    ynorm = helper.create_tmp_variable(dtype=X.dtype)
-    helper.append_op(
-        type='cos_sim',
-        inputs={'X': [X],
-                'Y': [Y]},
-        outputs={'Out': [out],
-                 'XNorm': [xnorm],
-                 'YNorm': [ynorm]})
-    return out
-
-
-def cross_entropy(input, label, **kwargs):
-    """
-    This function computes cross_entropy using the input and label.
-    """
-    helper = LayerHelper('cross_entropy', **kwargs)
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='cross_entropy',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out]},
-        attrs=kwargs)
-    return out
-
-
-def square_error_cost(input, label, **kwargs):
-    """
-    This functions returns the squared error cost using the input and label.
-    The output is appending the op to do the above.
-    """
-    helper = LayerHelper('square_error_cost', **kwargs)
-    minus_out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='elementwise_sub',
-        inputs={'X': [input],
-                'Y': [label]},
-        outputs={'Out': [minus_out]})
-
-    square_out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]})
-    return square_out
-
-
-def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
-    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
-    """
-    helper = LayerHelper("accuracy", **kwargs)
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
-    acc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
-    return acc_out
-
-
-def chunk_eval(input,
-               label,
-               chunk_scheme,
-               num_chunk_types,
-               excluded_chunk_types=None,
-               **kwargs):
-    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
-    """
-    helper = LayerHelper("chunk_eval", **kwargs)
-
-    # prepare output
-    precision = helper.create_tmp_variable(dtype="float32")
-    recall = helper.create_tmp_variable(dtype="float32")
-    f1_score = helper.create_tmp_variable(dtype="float32")
-
-    helper.append_op(
-        type="chunk_eval",
-        inputs={"Inference": [input],
-                "Label": [label]},
-        outputs={
-            "Precision": [precision],
-            "Recall": [recall],
-            "F1-Score": [f1_score]
-        },
-        attrs={
-            "num_chunk_types": num_chunk_types,
-            'chunk_scheme': chunk_scheme,
-            'excluded_chunk_types': excluded_chunk_types or []
-        })
-    return precision, recall, f1_score
-
-
-def sequence_conv(input,
-                  num_filters,
-                  filter_size=3,
-                  filter_stride=1,
-                  padding=None,
-                  bias_attr=None,
-                  param_attr=None,
-                  act=None,
-                  main_program=None,
-                  startup_program=None):
-    """
-    This function creates the op for sequence_conv, using the inputs and
-    other convolutional configurations for the filters and stride as given
-    in the input parameters to the function.
-    """
-
-    # FIXME(dzh) : want to unify the argument of python layer
-    # function. So we ignore some unecessary attributes.
-    # such as, padding_trainable, context_start.
-
-    helper = LayerHelper('sequence_conv', **locals())
-    dtype = helper.input_dtype()
-    filter_shape = [filter_size * input.shape[1], num_filters]
-    filter = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    pre_bias = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='sequence_conv',
-        inputs={
-            'X': [input],
-            'Filter': [filter],
-        },
-        outputs={"Out": pre_bias},
-        attrs={
-            'contextStride': filter_stride,
-            'contextStart': -int(filter_size / 2),
-            'contextLength': filter_size
-        })
-    pre_act = helper.append_bias_op(pre_bias)
-    return helper.append_activation(pre_act)
-
-
-def conv2d(input,
-           num_filters,
-           filter_size,
-           stride=[1, 1],
-           padding=None,
-           groups=None,
-           param_attr=None,
-           bias_attr=None,
-           act=None,
-           name=None,
-           main_program=None,
-           startup_program=None):
-    """
-    This function creates the op for a 2-dimensional Convolution.
-    This is performed using the parameters of filters(size, dimensionality etc)
-    , stride and other configurations for a Convolution operation.
-    This funciton can also append an activation on top of the
-    conv-2d output, if mentioned in the input parameters.
-    """
-
-    helper = LayerHelper('conv2d', **locals())
-    dtype = helper.input_dtype()
-
-    num_channels = input.shape[1]
-    if groups is None:
-        num_filter_channels = num_channels
-    else:
-        if num_channels % groups != 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels / groups
-
-    if isinstance(filter_size, int):
-        filter_size = [filter_size, filter_size]
-    if isinstance(stride, int):
-        stride = [stride, stride]
-    if isinstance(padding, int):
-        padding = [padding, padding]
-
-    input_shape = input.shape
-    filter_shape = [num_filters, num_filter_channels] + filter_size
-
-    def _get_default_param_initializer():
-        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
-        return Normal(0.0, std, 0)
-
-    filter = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype,
-        default_initializer=_get_default_param_initializer())
-
-    pre_bias = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='conv2d',
-        inputs={
-            'Input': input,
-            'Filter': filter,
-        },
-        outputs={"Output": pre_bias},
-        attrs={'strides': stride,
-               'paddings': padding,
-               'groups': groups})
-
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-
-    return helper.append_activation(pre_act)
-
-
-def sequence_pool(input, pool_type, **kwargs):
-    """
-    This function add the operator for sequence pooling.
-    This is applied on top of the input using pool_type mentioned
-    in the parameters.
-    """
-    helper = LayerHelper('sequence_pool', input=input, **kwargs)
-    dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-    max_index = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="sequence_pool",
-        inputs={"X": input},
-        outputs={"Out": pool_out,
-                 "MaxIndex": max_index},
-        attrs={"pooltype": pool_type.upper()})
-
-    return pool_out
-
-
-def pool2d(input,
-           pool_size,
-           pool_type,
-           pool_stride=[1, 1],
-           pool_padding=[0, 0],
-           global_pooling=False,
-           main_program=None,
-           startup_program=None):
-    """
-    This function adds the operator for pooling in 2 dimensions, using the
-    pooling configurations mentioned in input parameters.
-    """
-    if pool_type not in ["max", "avg"]:
-        raise ValueError(
-            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-            str(pool_type))
-    if isinstance(pool_size, int):
-        pool_size = [pool_size, pool_size]
-    if isinstance(pool_stride, int):
-        pool_stride = [pool_stride, pool_stride]
-    if isinstance(pool_padding, int):
-        pool_padding = [pool_padding, pool_padding]
-
-    helper = LayerHelper('pool2d', **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="pool2d",
-        inputs={"X": input},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "global_pooling": global_pooling,
-            "strides": pool_stride,
-            "paddings": pool_padding
-        })
-
-    return pool_out
-
-
-def batch_norm(input,
-               act=None,
-               is_test=False,
-               momentum=0.9,
-               epsilon=1e-05,
-               param_attr=None,
-               bias_attr=None,
-               data_layout='NCHW',
-               main_program=None,
-               startup_program=None):
-    """
-    This function helps create an operator to implement
-    the BatchNorm layer using the configurations from the input parameters.
-    """
-    helper = LayerHelper('batch_norm', **locals())
-    dtype = helper.input_dtype()
-
-    input_shape = input.shape
-    if data_layout == 'NCHW':
-        channel_num = input_shape[1]
-    else:
-        if data_layout == 'NHWC':
-            channel_num = input_shape[-1]
-        else:
-            raise ValueError("unsupported data layout:" + data_layout)
-
-    param_shape = [channel_num]
-
-    # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        default_initializer=Constant(1.0))
-
-    bias = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
-
-    mean = helper.create_global_variable(
-        dtype=input.dtype, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
-
-    variance = helper.create_global_variable(
-        dtype=input.dtype, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
-
-    # create output
-    # mean and mean_out share the same memory
-    mean_out = mean
-    # variance and variance out share the same memory
-    variance_out = variance
-    saved_mean = helper.create_tmp_variable(dtype)
-    saved_variance = helper.create_tmp_variable(dtype)
-
-    batch_norm_out = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="batch_norm",
-        inputs={
-            "X": input,
-            "Scale": scale,
-            "Bias": bias,
-            "Mean": mean,
-            "Variance": variance
-        },
-        outputs={
-            "Y": batch_norm_out,
-            "MeanOut": mean_out,
-            "VarianceOut": variance_out,
-            "SavedMean": saved_mean,
-            "SavedVariance": saved_variance
-        },
-        attrs={"momentum": momentum,
-               "epsilon": epsilon,
-               "is_test": is_test})
-
-    return helper.append_activation(batch_norm_out)
-
-
-def beam_search_decode(ids, scores, main_program=None, startup_program=None):
-    helper = LayerHelper('beam_search_decode', **locals())
-    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
-    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
-
-    helper.append_op(
-        type="beam_search_decode",
-        inputs={"Ids": ids,
-                "Scores": scores},
-        outputs={
-            "SentenceIds": sentence_ids,
-            "SentenceScores": sentence_scores
-        })
-
-    return sentence_ids, sentence_scores
-
-
-class BlockGuard(object):
-    """
-    BlockGuard class.
-
-    BlockGuard class is used to create a sub-block in a program by
-    using the Python `with` keyword.
-    """
-
-    def __init__(self, main_program):
-        if not isinstance(main_program, Program):
-            raise TypeError("BlockGuard takes a program")
-        self.main_program = main_program
-
-    def __enter__(self):
-        self.main_program.create_block()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.main_program.rollback()
-        if exc_type is not None:
-            return False  # re-raise exception
-        return True
-
-
-class StaticRNNGuard(BlockGuard):
-    """
-    StaticRNNGuard class.
-
-    StaticRNNGuard class is used to create a StaticRNN block in a program.
-    """
-
-    def __init__(self, rnn):
-        if not isinstance(rnn, StaticRNN):
-            raise TypeError("StaticRNNGuard takes a StaticRNN")
-        super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
-        self.rnn = rnn
-
-    def __enter__(self):
-        self.rnn.status = StaticRNN.IN_RNN_BLOCK
-        return super(StaticRNNGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
-        self.rnn.complete_rnn_op()
-        return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-class StaticRNNMemoryLink(object):
-    """
-    StaticRNNMemoryLink class.
-
-    Args:
-        init: the initial variable for Memory
-        init: Variable
-        pre_mem: the memory variable in previous time step
-        pre_mem: Variable
-        mem: the memory variable in current time step
-        mem: Variable
-
-    StaticRNNMemoryLink class is used to create a link between two
-    memory cells of a StaticRNN.
-    """
-
-    def __init__(self, init, pre_mem, mem=None):
-        self.init = init
-        self.pre_mem = pre_mem
-        self.mem = mem
-
-
-class StaticRNN(object):
-    """
-    StaticRNN class.
-
-    StaticRNN class is used to create a StaticRNN. The RNN will have its
-    own parameters like inputs, outputs, memories, status and length.
-    """
-    BEFORE_RNN_BLOCK = 0
-    IN_RNN_BLOCK = 1
-    AFTER_RNN_BLOCK = 2
-
-    def __init__(self, name=None, main_program=None):
-        self.helper = LayerHelper(
-            "static_rnn", name=name, main_program=main_program)
-        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
-        self.inputs = []  # input variable list in current block
-        self.outputs = []  # output variable list in parent block
-        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
-        # sequence length, since it is a static RNN, sequence length are fixed.
-        self.seq_len = None
-
-    def step(self):
-        return StaticRNNGuard(self)
-
-    def _assert_in_rnn_block_(self, method):
-        if self.status != StaticRNN.IN_RNN_BLOCK:
-            raise ValueError("You must invoke {0} in rnn block".format(method))
-
-    def memory(self,
-               init=None,
-               shape=None,
-               batch_ref=None,
-               init_value=0.0,
-               init_batch_dim_idx=0,
-               ref_batch_dim_idx=1):
-        """
-        Args:
-            init: boot memory, if not set, a shape, batch_ref must be provided
-            shape: shape of the boot memory
-            batch_ref: batch size reference variable
-            init_value: the init value of boot memory
-            init_batch_dim_idx: the index of batch size in init's dimension
-            ref_batch_dim_idx: the index of batch size in batch_ref's dimension
-        """
-        self._assert_in_rnn_block_('memory')
-        if init is None:
-            if shape is None or batch_ref is None:
-                raise ValueError(
-                    "if init is None, memory at least need shape and batch_ref")
-            parent_block = self.parent_block()
-            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
-            boot_var = parent_block.create_var(
-                name=var_name,
-                shape=shape,
-                dtype=batch_ref.dtype,
-                persistable=False)
-
-            parent_block.append_op(
-                type="fill_constant_batch_size_like",
-                inputs={'Input': [batch_ref]},
-                outputs={'Out': [boot_var]},
-                attrs={
-                    'value': init_value,
-                    'shape': boot_var.shape,
-                    'dtype': boot_var.dtype,
-                    'input_dim_idx': ref_batch_dim_idx,
-                    'output_dim_idx': init_batch_dim_idx
-                })
-
-            return self.memory(init=boot_var)
-        else:
-            pre_mem = self.helper.create_variable(
-                name=unique_name("@".join([self.helper.name, "mem"])),
-                dtype=init.dtype,
-                shape=init.shape)
-            self.memories[pre_mem.name] = StaticRNNMemoryLink(
-                init=init, pre_mem=pre_mem)
-            return pre_mem
-
-    def step_input(self, x):
-        self._assert_in_rnn_block_('step_input')
-        if not isinstance(x, Variable):
-            raise TypeError("step input takes a Variable")
-        if self.seq_len is None:
-            self.seq_len = x.shape[0]
-        elif self.seq_len != x.shape[0]:
-            raise ValueError("Static RNN only take fix seq_len input")
-
-        ipt = self.helper.create_variable(
-            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
-        self.inputs.append(ipt)
-        return ipt
-
-    def step_output(self, o):
-        self._assert_in_rnn_block_('step_output')
-        if not isinstance(o, Variable):
-            raise TypeError("step output takes a Variable")
-
-        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
-        self.helper.append_op(
-            type='rnn_memory_helper',
-            inputs={'X': [o]},
-            outputs={'Out': tmp_o},
-            attrs={'dtype': o.dtype})
-
-        out_var = self.parent_block().create_var(
-            name=tmp_o.name,
-            shape=[self.seq_len] + list(tmp_o.shape),
-            dtype=tmp_o.dtype)
-
-        self.outputs.append(out_var)
-
-    def output(self, *outputs):
-        for each in outputs:
-            self.step_output(each)
-
-    def update_memory(self, mem, var):
-        if not isinstance(mem, Variable) or not isinstance(var, Variable):
-            raise TypeError("update memory should take variables")
-        self.memories[mem.name].mem = var
-
-    def parent_block(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-        return parent_block
-
-    def __call__(self, *args, **kwargs):
-        if self.status != StaticRNN.AFTER_RNN_BLOCK:
-            raise ValueError("RNN output can only be retrieved after rnn block")
-        if len(self.outputs) == 0:
-            raise ValueError("RNN has no output")
-        elif len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def complete_rnn_op(self):
-        main_program = self.helper.main_program
-        rnn_block = main_program.current_block()
-        parent_block = self.parent_block()
-
-        local_inputs = set()
-
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
-        for var in self.inputs:
-            local_inputs.add(var.name)
-        for m in self.memories:
-            local_inputs.add(m)
-
-        params = list()
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in local_inputs:
-                        params.append(in_var_name)
-
-        parameters = [parent_block.var(name) for name in params]
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        inlinks = [parent_block.var(i.name) for i in self.inputs]
-        outlinks = self.outputs
-
-        boot_memories = []
-        pre_memories = []
-        memories = []
-        for _, mem in self.memories.iteritems():
-            boot_memories.append(mem.init)
-            pre_memories.append(mem.pre_mem.name)
-            mem_var = rnn_block.var(mem.mem.name)
-            assert isinstance(mem_var, Variable)
-            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
-
-            rnn_block.append_op(
-                type='rnn_memory_helper',
-                inputs={'X': [mem_var]},
-                outputs={'Out': [new_mem]},
-                attrs={'dtype': mem_var.dtype})
-
-            memories.append(new_mem.name)
-
-        parent_block.append_op(
-            type='recurrent',
-            inputs={
-                'inputs': inlinks,
-                'initial_states': boot_memories,
-                'parameters': parameters
-            },
-            outputs={'outputs': outlinks,
-                     'step_scopes': [step_scope]},
-            attrs={
-                'ex_states': pre_memories,
-                'states': memories,
-                'step_block': rnn_block
-            })
-
-
-class WhileGuard(BlockGuard):
-    def __init__(self, while_op):
-        if not isinstance(while_op, While):
-            raise TypeError("WhileGuard takes a while op")
-        super(WhileGuard, self).__init__(while_op.helper.main_program)
-        self.while_op = while_op
-
-    def __enter__(self):
-        self.while_op.status = While.IN_WHILE_BLOCK
-        return super(WhileGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self.while_op.status = While.AFTER_WHILE_BLOCK
-        self.while_op.complete()
-        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-class While(object):
-    BEFORE_WHILE_BLOCK = 0
-    IN_WHILE_BLOCK = 1
-    AFTER_WHILE_BLOCK = 2
-
-    def __init__(self, cond, name=None, main_program=None):
-        self.helper = LayerHelper("while", name=name, main_program=main_program)
-        self.status = While.BEFORE_WHILE_BLOCK
-        if not isinstance(cond, Variable):
-            raise TypeError("condition should be a variable")
-        assert isinstance(cond, Variable)
-        if cond.dtype != core.DataType.BOOL:
-            raise TypeError("condition should be a bool variable")
-        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
-            raise TypeError("condition should be a bool scalar")
-        self.cond_var = cond
-
-    def block(self):
-        return WhileGuard(self)
-
-    def complete(self):
-        main_program = self.helper.main_program
-        while_block = main_program.current_block()
-        parent_block = main_program.block(main_program.current_block()
-                                          .parent_idx)
-
-        inner_outputs = {self.cond_var.name}
-        x_name_list = set()
-        for op in while_block.ops:
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        x_name_list.add(in_var_name)
-
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            if inner_out_name in parent_block.vars:
-                out_vars.append(parent_block.var(inner_out_name))
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        parent_block.append_op(
-            type='while',
-            inputs={
-                'X': [parent_block.var(x_name) for x_name in x_name_list],
-                'Condition': [self.cond_var]
-            },
-            outputs={'Out': out_vars,
-                     'StepScopes': [step_scope]},
-            attrs={'step_block': while_block})
-
-
-def lstm(x,
-         c_pre_init,
-         hidden_dim,
-         forget_bias=None,
-         main_program=None,
-         startup_program=None):
-    """
-    This function helps create an operator for the LSTM (Long Short Term
-    Memory) cell that can be used inside an RNN.
-    """
-    helper = LayerHelper('lstm_unit', **locals())
-    rnn = StaticRNN()
-    with rnn.step():
-        c_pre = rnn.memory(init=c_pre_init)
-        x_t = rnn.step_input(x)
-
-        before_fc = concat(
-            input=[x_t, c_pre],
-            axis=1,
-            main_program=main_program,
-            startup_program=startup_program)
-        after_fc = fc(input=before_fc,
-                      size=hidden_dim * 4,
-                      main_program=main_program,
-                      startup_program=startup_program)
-
-        dtype = x.dtype
-        c = helper.create_tmp_variable(dtype)
-        h = helper.create_tmp_variable(dtype)
-
-        helper.append_op(
-            type='lstm_unit',
-            inputs={"X": after_fc,
-                    "C_prev": c_pre},
-            outputs={"C": c,
-                     "H": h},
-            attrs={"forget_bias": forget_bias})
-
-        rnn.update_memory(c_pre, c)
-        rnn.output(h)
-
-    return rnn()
-
-
-def lod_rank_table(x, level=0, main_program=None):
-    """
-    This function creates an operator for creating a LOD_RANK_TABLE
-    using the input x.
-    """
-    helper = LayerHelper("lod_rank_table", **locals())
-    table = helper.create_variable(
-        type=core.VarDesc.VarType.LOD_RANK_TABLE,
-        name=unique_name("lod_rank_table"))
-    helper.append_op(
-        type='lod_rank_table',
-        inputs={'X': x},
-        outputs={'Out': table},
-        attrs={'level': level})
-    return table
-
-
-def max_sequence_len(rank_table, main_program=None):
-    """
-    This function creates an operator to calculate the length of
-    max seqence through input rank_table(should be a lod_rank_table)
-    """
-    helper = LayerHelper("max_seqence_len", **locals())
-    res = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="max_sequence_len",
-        inputs={"RankTable": rank_table},
-        outputs={"Out": res})
-    return res
-
-
-def topk(input, k, main_program=None, startup_program=None):
-    helper = LayerHelper('topk', **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.data_type)
-    topk_indices = helper.create_tmp_variable(dtype='int64')
-    helper.append_op(
-        type='top_k',
-        inputs={'X': [input]},
-        outputs={'Out': [topk_out],
-                 'Indices': [topk_indices]},
-        attrs={'k': k})
-    return topk_out, topk_indices
-
-
-def lod_tensor_to_array(x, table, main_program=None):
-    """
-    This function creates an operator to convert an LOD_Tensor to
-    an array.
-    """
-    helper = LayerHelper("lod_tensor_to_array", **locals())
-    array = helper.create_variable(
-        name=unique_name("lod_tensor_to_array"),
-        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        dtype=x.dtype)
-    helper.append_op(
-        type='lod_tensor_to_array',
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': array})
-    return array
-
-
-def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
-    """
-    This function creates an operator to convert an array to a
-    LOD_Tensor.
-    """
-    helper = LayerHelper("array_to_lod_tensor", **locals())
-    tmp = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="array_to_lod_tensor",
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': tmp})
-    return tmp
-
-
-def fill_constant(shape,
-                  dtype,
-                  value,
-                  out=None,
-                  main_program=None,
-                  startup_program=None):
-    """
-    This function creates a tensor , with shape as mentioned in the input and
-    specified dtype and fills this up with a constant value that
-    comes in the input. It also sets the stop_gradient to be True.
-    """
-    helper = LayerHelper("fill_constant", **locals())
-    if out is None:
-        out = helper.create_tmp_variable(dtype=dtype)
-    helper.append_op(
-        type='fill_constant',
-        inputs={},
-        outputs={'Out': [out]},
-        attrs={'shape': shape,
-               'dtype': out.dtype,
-               'value': float(value)})
-    out.stop_gradient = True
-    return out
-
-
-def fill_constant_batch_size_like(input,
-                                  shape,
-                                  dtype,
-                                  value,
-                                  input_dim_idx=0,
-                                  output_dim_idx=0,
-                                  main_program=None,
-                                  startup_program=None):
-    helper = LayerHelper("fill_constant_batch_size_like", **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
-    helper.append_op(
-        type='fill_constant_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': [out]},
-        attrs={
-            'shape': shape,
-            'dtype': out.dtype,
-            'value': float(value),
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx
-        })
-    out.stop_gradient = True
-    return out
-
-
-def ones(shape, dtype, main_program=None):
-    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 1.0.
-    """
-    return fill_constant(value=1.0, **locals())
-
-
-def zeros(shape, dtype, main_program=None):
-    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 0.0.
-    """
-    return fill_constant(value=0.0, **locals())
-
-
-def increment(x,
-              value=1.0,
-              in_place=True,
-              main_program=None,
-              startup_program=None):
-    """
-    This function creates an operator to increment each value in the input
-    `x` by an amount: `value` as mentioned in the input parameter. This
-    operation is performed in-place by default.
-    """
-    helper = LayerHelper("increment", **locals())
-    if not in_place:
-        out = helper.create_tmp_variable(dtype=x.dtype)
-    else:
-        out = x
-    helper.append_op(
-        type='increment',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'step': float(value)})
-    return out
-
-
-def array_write(x, i, array=None, main_program=None, startup_program=None):
-    """
-    This function creates an operator to write the data out as a
-    LOD_TENSOR_ARRAY.
-    """
-    helper = LayerHelper('array_write', **locals())
-    if array is None:
-        array = helper.create_variable(
-            name="{0}.out".format(helper.name),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=x.dtype)
-    helper.append_op(
-        type='write_to_array',
-        inputs={'X': [x],
-                'I': [i]},
-        outputs={'Out': [array]})
-    return array
-
-
-def create_array(dtype, main_program=None):
-    helper = LayerHelper("array", **locals())
-    return helper.create_variable(
-        name="{0}.out".format(helper.name),
-        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        dtype=dtype)
-
-
-def less_than(x, y, cond=None, main_program=None, **ignored):
-    helper = LayerHelper("less_than", **locals())
-    if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
-        cond.stop_gradient = True
-
-    helper.append_op(
-        type='less_than', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [cond]})
-    return cond
-
-
-def array_read(array, i, main_program=None, startup_program=None):
-    """
-    This function creates an operator to read the data in as a
-    LOD_TENSOR_ARRAY.
-    """
-    helper = LayerHelper('array_read', **locals())
-    if not isinstance(
-            array,
-            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
-        raise TypeError("array should be tensor array vairable")
-    out = helper.create_tmp_variable(dtype=array.dtype)
-    helper.append_op(
-        type='read_from_array',
-        inputs={'X': [array],
-                'I': [i]},
-        outputs={'Out': [out]})
-    return out
-
-
-def shrink_memory(x, i, table, main_program=None, startup_program=None):
-    """
-    This function creates an operator to shrink_rnn_memory using the RankTable
-    as mentioned in the input parameter.
-    """
-    helper = LayerHelper('shrink_memory', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type='shrink_rnn_memory',
-        inputs={'X': [x],
-                'I': [i],
-                'RankTable': [table]},
-        outputs={'Out': [out]},
-        attrs={})
-    return out
-
-
-def array_length(array, main_program=None):
-    """
-    This function creates an operator to find the length of the
-    LOD_TENSOR_ARRAY.
-    """
-    helper = LayerHelper('array_length', **locals())
-    tmp = helper.create_tmp_variable(dtype='int64')
-    tmp.stop_gradient = True
-    helper.append_op(
-        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
-    return tmp
-
-
-def conv2d_transpose(input,
-                     num_filters,
-                     output_size=None,
-                     filter_size=None,
-                     padding=None,
-                     stride=None,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
-    """
-    The transpose of conv2d layer.
-
-    This layer is also known as deconvolution layer.
-
-    Args:
-        input(Variable): The input image with [N, C, H, W] format.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). This
-            parameter only works when filter_size is None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.  None if use output size to
-            calculate filter_size
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride.
-        param_attr: Parameter Attribute.
-        main_program(Program): the main program
-        startup_program(Program): the startup program
-
-    Returns:
-        Variable: Output image.
-    """
-    helper = LayerHelper("conv2d_transpose", **locals())
-    if not isinstance(input, Variable):
-        raise TypeError("Input of conv2d_transpose must be Variable")
-    input_channel = input.shape[1]
-
-    op_attr = dict()
-
-    if isinstance(padding, int):
-        op_attr['paddings'] = [padding, padding]
-    elif padding is not None:
-        op_attr['paddings'] = padding
-
-    if isinstance(stride, int):
-        op_attr['strides'] = stride
-    elif stride is not None:
-        op_attr['strides'] = stride
-
-    if filter_size is None:
-        if output_size is None:
-            raise ValueError("output_size must be set when filter_size is None")
-        if isinstance(output_size, int):
-            output_size = [output_size, output_size]
-
-        padding = op_attr.get('paddings', [0, 0])
-        stride = op_attr.get('strides', [1, 1])
-
-        h_in = input.shape[2]
-        w_in = input.shape[3]
-        filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
-        filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
-        filter_size = [filter_size_h, filter_size_w]
-    elif isinstance(filter_size, int):
-        filter_size = [filter_size, filter_size]
-
-    filter_shape = [input_channel, num_filters] + filter_size
-    img_filter = helper.create_parameter(
-        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
-
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    helper.append_op(
-        type='conv2d_transpose',
-        inputs={'Input': [input],
-                'Filter': [img_filter]},
-        outputs={'Output': out},
-        attrs=op_attr)
-
-    return out
-
-
-class ConditionalBlockGuard(BlockGuard):
-    def __init__(self, block):
-        if not isinstance(block, ConditionalBlock):
-            raise TypeError("block should be conditional block")
-        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
-        self.block = block
-
-    def __enter__(self):
-        return super(ConditionalBlockGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.block.complete()
-        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
-                                                           exc_tb)
-
-
-class ConditionalBlock(object):
-    def __init__(self,
-                 inputs,
-                 name=None,
-                 main_program=None,
-                 startup_program=None):
-        for each_input in inputs:
-            if not isinstance(each_input, Variable):
-                raise TypeError("Each input should be variable")
-        self.inputs = inputs
-        self.helper = LayerHelper(
-            'conditional_block',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
-
-    def block(self):
-        return ConditionalBlockGuard(self)
-
-    def complete(self):
-        inside_block = self.helper.main_program.current_block()
-        parent_block = self.helper.main_program.block(inside_block.parent_idx)
-
-        intermediate = set()
-        params = set()
-
-        for each_op in inside_block.ops:
-            assert isinstance(each_op, Operator)
-            for iname in each_op.input_names:
-                for in_var_name in each_op.input(iname):
-                    if in_var_name not in intermediate:
-                        params.add(in_var_name)
-
-            for oname in each_op.output_names:
-                for out_var_name in each_op.output(oname):
-                    intermediate.add(out_var_name)
-        input_set = set([ipt.name for ipt in self.inputs])
-
-        param_list = [
-            parent_block.var(each_name) for each_name in params
-            if each_name not in input_set
-        ]
-
-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name not in intermediate
-        ]
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-        parent_block.append_op(
-            type='conditional_block',
-            inputs={
-                'X': self.inputs,
-                'Params': param_list,
-            },
-            outputs={'Out': out_list,
-                     'Scope': [step_scope]},
-            attrs={'block': inside_block})
-
-
-class IfElseBlockGuard(object):
-    def __init__(self, is_true, ifelse):
-        if not isinstance(ifelse, IfElse):
-            raise TypeError("ifelse must be an instance of IfElse class")
-
-        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("You cannot invoke IfElse.block() inside a block")
-
-        self.is_true = is_true
-        self.ie = ifelse
-        if is_true:
-            self.cond_block = ifelse.conditional_true_block
-        else:
-            self.cond_block = ifelse.conditional_false_block
-
-        if not isinstance(self.cond_block, ConditionalBlock):
-            raise TypeError("Unexpected situation")
-
-        self.cond_block = self.cond_block.block()
-
-    def __enter__(self):
-        self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS
-        self.cond_block.__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if not self.cond_block.__exit__(exc_type, exc_val, exc_tb):
-            # re-raise inside exception
-            return False
-        if len(self.ie.output_table[1 if self.is_true else 0]) == 0:
-            raise ValueError("Must set output inside block")
-        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
-
-
-class IfElse(object):
-    OUT_IF_ELSE_BLOCKS = 0
-    IN_IF_ELSE_TRUE_BLOCKS = 1
-    IN_IF_ELSE_FALSE_BLOCKS = 2
-
-    def __init__(self, cond, name=None, main_program=None,
-                 startup_program=None):
-        if not isinstance(cond, Variable):
-            raise TypeError("cond must be a Variable")
-        self.helper = LayerHelper(
-            'ifelse',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
-        self.cond = cond
-        self.input_table = {}
-        self.status = IfElse.OUT_IF_ELSE_BLOCKS
-        self.conditional_true_block = ConditionalBlock(inputs=[self.cond])
-        self.conditional_false_block = ConditionalBlock(inputs=[self.cond])
-        self.output_table = ([], [])  # (true_outs, false_outs)
-
-    def input(self, x):
-        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("input must in true/false blocks")
-        if id(x) not in self.input_table:
-            parent_block = self.parent_block()
-            out_true = parent_block.create_var(
-                name=unique_name('ifelse_input' + self.helper.name),
-                dtype=x.dtype)
-
-            out_false = parent_block.create_var(
-                name=unique_name('ifelse_input' + self.helper.name),
-                dtype=x.dtype)
-            parent_block.append_op(
-                type='split_lod_tensor',
-                inputs={
-                    'X': x,
-                    'Mask': self.cond,
-                },
-                outputs={'OutTrue': out_true,
-                         'OutFalse': out_false},
-                attrs={'level': 0})
-            self.input_table[id(x)] = (out_true, out_false)
-        else:
-            out_true, out_false = self.input_table[id(x)]
-
-        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
-            return out_true
-        else:
-            return out_false
-
-    def parent_block(self):
-        current_block = self.helper.main_program.current_block()
-        return self.helper.main_program.block(current_block.parent_idx)
-
-    def true_block(self):
-        return IfElseBlockGuard(True, self)
-
-    def false_block(self):
-        return IfElseBlockGuard(False, self)
-
-    def output(self, *outs):
-        if self.status == self.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("output can only be invoked in the sub-block")
-
-        out_table = self.output_table[1 if self.status ==
-                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
-        parent_block = self.parent_block()
-        for each_out in outs:
-            if not isinstance(each_out, Variable):
-                raise TypeError("Each output should be a variable")
-            # create outside tensor
-            outside_out = parent_block.create_var(
-                name=unique_name("_".join([self.helper.name, 'output'])),
-                dtype=each_out.dtype)
-            out_table.append(outside_out)
-
-            # assign local var to outside
-            assign(
-                input=each_out,
-                output=outside_out,
-                main_program=self.helper.main_program,
-                startup_program=self.helper.startup_program)
-
-    def __call__(self):
-        if self.status != self.OUT_IF_ELSE_BLOCKS:
-            raise ValueError("IfElse::__call__ must be out of sub-block")
-        false_len, true_len = map(len, self.output_table)
-        if false_len == 0 and true_len == 0:
-            raise ValueError("Must invoke true_block/false_block before "
-                             "__call__")
-        elif false_len != true_len and false_len != 0 and true_len != 0:
-            raise ValueError("The output side must be same")
-        elif false_len == 0 or true_len == 0:
-            return self.output_table[0 if false_len != 0 else 1]
-
-        # else none of false_len/true_len is zero
-        # merge together
-        rlist = []
-        for false_var, true_var in zip(*self.output_table):
-            rlist.append(
-                merge_lod_tensor(
-                    in_true=true_var,
-                    in_false=false_var,
-                    mask=self.cond,
-                    x=self.cond,
-                    level=0,
-                    main_program=self.helper.main_program,
-                    startup_program=self.helper.startup_program))
-        return rlist
-
-
-class DynamicRNN(object):
-    BEFORE_RNN = 0
-    IN_RNN = 1
-    AFTER_RNN = 2
-
-    def __init__(self, name=None, main_program=None, startup_program=None):
-        self.helper = LayerHelper(
-            'dynamic_rnn',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
-        self.status = DynamicRNN.BEFORE_RNN
-        self.lod_rank_table = None
-        self.max_seq_len = None
-        self.step_idx = None
-        self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64')
-        self.mem_dict = dict()
-        self.output_array = []
-        self.outputs = []
-        self.cond = self.helper.create_tmp_variable(dtype='bool')
-        self.cond.stop_gradient = False
-        self.while_op = While(self.cond)
-        self.input_array = []
-        self.mem_link = []
-
-    def step_input(self, x):
-        self._assert_in_rnn_block_("step_input")
-        if not isinstance(x, Variable):
-            raise TypeError(
-                "step_input() can only take a Variable as its input")
-        parent_block = self._parent_block_()
-        if self.lod_rank_table is None:
-            self.lod_rank_table = parent_block.create_var(
-                name=unique_name('lod_rank_table'),
-                type=core.VarDesc.VarType.LOD_RANK_TABLE)
-            self.lod_rank_table.stop_gradient = True
-            parent_block.append_op(
-                type='lod_rank_table',
-                inputs={"X": x},
-                outputs={"Out": self.lod_rank_table})
-            self.max_seq_len = parent_block.create_var(
-                name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64')
-            self.max_seq_len.stop_gradient = False
-            parent_block.append_op(
-                type='max_sequence_len',
-                inputs={'RankTable': self.lod_rank_table},
-                outputs={"Out": self.max_seq_len})
-            self.cond.stop_gradient = True
-            parent_block.append_op(
-                type='less_than',
-                inputs={'X': self.step_idx,
-                        'Y': self.max_seq_len},
-                outputs={'Out': self.cond})
-
-        input_array = parent_block.create_var(
-            name=unique_name('dynamic_rnn_input_array'),
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-            dtype=x.dtype)
-        self.input_array.append((input_array, x.dtype))
-        parent_block.append_op(
-            type='lod_tensor_to_array',
-            inputs={'X': x,
-                    'RankTable': self.lod_rank_table},
-            outputs={'Out': input_array})
-        return array_read(
-            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
-
-    @contextlib.contextmanager
-    def block(self):
-        if self.status != DynamicRNN.BEFORE_RNN:
-            raise ValueError("rnn.block() can only be invoke once")
-        self.step_idx = fill_constant(shape=[1], dtype='int64', value=0)
-        self.step_idx.stop_gradient = False
-        self.status = DynamicRNN.IN_RNN
-        with self.while_op.block():
-            yield
-            increment(
-                x=self.step_idx,
-                value=1.0,
-                in_place=True,
-                **self.helper.to_kwargs)
-
-            for new_mem, mem_array in self.mem_link:
-                array_write(
-                    x=new_mem,
-                    i=self.step_idx,
-                    array=mem_array,
-                    **self.helper.to_kwargs)
-
-            less_than(
-                x=self.step_idx,
-                y=self.max_seq_len,
-                cond=self.cond,
-                **self.helper.to_kwargs)
-
-        self.status = DynamicRNN.AFTER_RNN
-        for each_array in self.output_array:
-            self.outputs.append(
-                array_to_lod_tensor(
-                    x=each_array,
-                    table=self.lod_rank_table,
-                    **self.helper.to_kwargs))
-
-    def __call__(self, *args, **kwargs):
-        if self.status != DynamicRNN.AFTER_RNN:
-            raise ValueError(
-                "Dynamic RNN outputs can only be retrieved after rnn block")
-        if len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def memory(self, init=None, shape=None, value=0.0, dtype='float32'):
-        self._assert_in_rnn_block_('memory')
-        if init is not None:
-            if not isinstance(init, Variable):
-                raise TypeError(
-                    "The input arg `init` of memory() must be a Variable")
-            parent_block = self._parent_block_()
-            mem_array = parent_block.create_var(
-                name=unique_name('dynamic_rnn_mem_array'),
-                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                dtype=init.dtype)
-            parent_block.append_op(
-                type='write_to_array',
-                inputs={'X': init,
-                        'I': self.zero_idx},
-                outputs={'Out': mem_array})
-            retv = array_read(
-                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
-            retv = shrink_memory(
-                x=retv,
-                i=self.step_idx,
-                table=self.lod_rank_table,
-                **self.helper.to_kwargs)
-            self.mem_dict[retv.name] = mem_array
-            return retv
-        else:
-            if len(self.input_array) == 0:
-                raise ValueError(
-                    "step_input should be invoked before memory(shape=..., value=...)"
-                )
-            parent_block = self._parent_block_()
-            init = parent_block.create_var(
-                name=unique_name('mem_init'), dtype=dtype)
-            arr, dtype = self.input_array[0]
-            in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype)
-            parent_block.append_op(
-                type='read_from_array',
-                inputs={'X': [arr],
-                        'I': [self.zero_idx]},
-                outputs={'Out': [in0]})
-            parent_block.append_op(
-                type='fill_constant_batch_size_like',
-                inputs={'Input': [in0]},
-                outputs={'Out': [init]},
-                attrs={
-                    'shape': [-1] + shape,
-                    'value': float(value),
-                    'dtype': init.dtype
-                })
-            return self.memory(init=init)
-
-    def update_memory(self, ex_mem, new_mem):
-        self._assert_in_rnn_block_('update_memory')
-        if not isinstance(ex_mem, Variable):
-            raise TypeError("The input arg `ex_mem` of update_memory() must "
-                            "be a Variable")
-        if not isinstance(new_mem, Variable):
-            raise TypeError("The input arg `new_mem` of update_memory() must "
-                            "be a Variable")
-
-        mem_array = self.mem_dict.get(ex_mem.name, None)
-        if mem_array is None:
-            raise ValueError("Please invoke memory before update_memory")
-        if self.lod_rank_table is None:
-            raise ValueError("Please invoke step_input before update_memory")
-
-        self.mem_link.append((new_mem, mem_array))
-
-    def output(self, *outputs):
-        self._assert_in_rnn_block_('output')
-        parent_block = self._parent_block_()
-        for each in outputs:
-            outside_array = parent_block.create_var(
-                name=unique_name("_".join(
-                    [self.helper.name, "output_array", each.name])),
-                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                dtype=each.dtype)
-            array_write(x=each, i=self.step_idx, array=outside_array)
-            self.output_array.append(outside_array)
-
-    def _parent_block_(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-
-        return parent_block
-
-    def _assert_in_rnn_block_(self, method):
-        if self.status != DynamicRNN.IN_RNN:
-            raise ValueError("{0} can only be invoked inside rnn block.".format(
-                method))
diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py
new file mode 100644
index 0000000000..249f570e13
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/__init__.py
@@ -0,0 +1,17 @@
+import ops
+from ops import *
+import nn
+from nn import *
+import io
+from io import *
+import tensor
+from tensor import *
+import control_flow
+from control_flow import *
+
+__all__ = []
+__all__ += nn.__all__
+__all__ += io.__all__
+__all__ += tensor.__all__
+__all__ += control_flow.__all__
+__all__ += ops.__all__
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
new file mode 100644
index 0000000000..5af6c78977
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -0,0 +1,1022 @@
+from ..layer_helper import LayerHelper, unique_name
+from ..framework import Program, Variable, Operator
+from .. import core
+from tensor import assign, fill_constant
+import contextlib
+
+__all__ = [
+    'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard',
+    'StaticRNNMemoryLink', 'WhileGuard', 'While', 'lod_rank_table',
+    'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor',
+    'increment', 'array_write', 'create_array', 'less_than', 'array_read',
+    'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock',
+    'StaticRNN'
+]
+
+
+def split_lod_tensor(input,
+                     mask,
+                     level=0,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('split_lod_tensor', **locals())
+    out_true = helper.create_tmp_variable(dtype=input.dtype)
+    out_false = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={
+            'X': input,
+            'Mask': mask,
+        },
+        outputs={'OutTrue': out_true,
+                 'OutFalse': out_false},
+        attrs={'level': level})
+    return out_true, out_false
+
+
+def merge_lod_tensor(in_true,
+                     in_false,
+                     x,
+                     mask,
+                     level=0,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    out = helper.create_tmp_variable(dtype=in_true.dtype)
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x,
+                'Mask': mask,
+                'InTrue': in_true,
+                'InFalse': in_false},
+        outputs={'Out': out},
+        attrs={'level': level})
+    return out
+
+
+class BlockGuard(object):
+    """
+    BlockGuard class.
+
+    BlockGuard class is used to create a sub-block in a program by
+    using the Python `with` keyword.
+    """
+
+    def __init__(self, main_program):
+        if not isinstance(main_program, Program):
+            raise TypeError("BlockGuard takes a program")
+        self.main_program = main_program
+
+    def __enter__(self):
+        self.main_program.create_block()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.main_program.rollback()
+        if exc_type is not None:
+            return False  # re-raise exception
+        return True
+
+
+class StaticRNNGuard(BlockGuard):
+    """
+    StaticRNNGuard class.
+
+    StaticRNNGuard class is used to create a StaticRNN block in a program.
+    """
+
+    def __init__(self, rnn):
+        if not isinstance(rnn, StaticRNN):
+            raise TypeError("StaticRNNGuard takes a StaticRNN")
+        super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
+        self.rnn = rnn
+
+    def __enter__(self):
+        self.rnn.status = StaticRNN.IN_RNN_BLOCK
+        return super(StaticRNNGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
+        self.rnn.complete_rnn_op()
+        return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class StaticRNNMemoryLink(object):
+    """
+    StaticRNNMemoryLink class.
+
+    Args:
+        init: the initial variable for Memory
+        init: Variable
+        pre_mem: the memory variable in previous time step
+        pre_mem: Variable
+        mem: the memory variable in current time step
+        mem: Variable
+
+    StaticRNNMemoryLink class is used to create a link between two
+    memory cells of a StaticRNN.
+    """
+
+    def __init__(self, init, pre_mem, mem=None):
+        self.init = init
+        self.pre_mem = pre_mem
+        self.mem = mem
+
+
+class StaticRNN(object):
+    """
+    StaticRNN class.
+
+    StaticRNN class is used to create a StaticRNN. The RNN will have its
+    own parameters like inputs, outputs, memories, status and length.
+    """
+    BEFORE_RNN_BLOCK = 0
+    IN_RNN_BLOCK = 1
+    AFTER_RNN_BLOCK = 2
+
+    def __init__(self, name=None, main_program=None):
+        self.helper = LayerHelper(
+            "static_rnn", name=name, main_program=main_program)
+        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
+        self.inputs = []  # input variable list in current block
+        self.outputs = []  # output variable list in parent block
+        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
+        # sequence length, since it is a static RNN, sequence length are fixed.
+        self.seq_len = None
+
+    def step(self):
+        return StaticRNNGuard(self)
+
+    def _assert_in_rnn_block_(self, method):
+        if self.status != StaticRNN.IN_RNN_BLOCK:
+            raise ValueError("You must invoke {0} in rnn block".format(method))
+
+    def memory(self,
+               init=None,
+               shape=None,
+               batch_ref=None,
+               init_value=0.0,
+               init_batch_dim_idx=0,
+               ref_batch_dim_idx=1):
+        """
+        Args:
+            init: boot memory, if not set, a shape, batch_ref must be provided
+            shape: shape of the boot memory
+            batch_ref: batch size reference variable
+            init_value: the init value of boot memory
+            init_batch_dim_idx: the index of batch size in init's dimension
+            ref_batch_dim_idx: the index of batch size in batch_ref's dimension
+        """
+        self._assert_in_rnn_block_('memory')
+        if init is None:
+            if shape is None or batch_ref is None:
+                raise ValueError(
+                    "if init is None, memory at least need shape and batch_ref")
+            parent_block = self.parent_block()
+            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
+            boot_var = parent_block.create_var(
+                name=var_name,
+                shape=shape,
+                dtype=batch_ref.dtype,
+                persistable=False)
+
+            parent_block.append_op(
+                type="fill_constant_batch_size_like",
+                inputs={'Input': [batch_ref]},
+                outputs={'Out': [boot_var]},
+                attrs={
+                    'value': init_value,
+                    'shape': boot_var.shape,
+                    'dtype': boot_var.dtype,
+                    'input_dim_idx': ref_batch_dim_idx,
+                    'output_dim_idx': init_batch_dim_idx
+                })
+
+            return self.memory(init=boot_var)
+        else:
+            pre_mem = self.helper.create_variable(
+                name=unique_name("@".join([self.helper.name, "mem"])),
+                dtype=init.dtype,
+                shape=init.shape)
+            self.memories[pre_mem.name] = StaticRNNMemoryLink(
+                init=init, pre_mem=pre_mem)
+            return pre_mem
+
+    def step_input(self, x):
+        self._assert_in_rnn_block_('step_input')
+        if not isinstance(x, Variable):
+            raise TypeError("step input takes a Variable")
+        if self.seq_len is None:
+            self.seq_len = x.shape[0]
+        elif self.seq_len != x.shape[0]:
+            raise ValueError("Static RNN only take fix seq_len input")
+
+        ipt = self.helper.create_variable(
+            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
+        self.inputs.append(ipt)
+        return ipt
+
+    def step_output(self, o):
+        self._assert_in_rnn_block_('step_output')
+        if not isinstance(o, Variable):
+            raise TypeError("step output takes a Variable")
+
+        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
+        self.helper.append_op(
+            type='rnn_memory_helper',
+            inputs={'X': [o]},
+            outputs={'Out': tmp_o},
+            attrs={'dtype': o.dtype})
+
+        out_var = self.parent_block().create_var(
+            name=tmp_o.name,
+            shape=[self.seq_len] + list(tmp_o.shape),
+            dtype=tmp_o.dtype)
+
+        self.outputs.append(out_var)
+
+    def output(self, *outputs):
+        for each in outputs:
+            self.step_output(each)
+
+    def update_memory(self, mem, var):
+        if not isinstance(mem, Variable) or not isinstance(var, Variable):
+            raise TypeError("update memory should take variables")
+        self.memories[mem.name].mem = var
+
+    def parent_block(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def __call__(self, *args, **kwargs):
+        if self.status != StaticRNN.AFTER_RNN_BLOCK:
+            raise ValueError("RNN output can only be retrieved after rnn block")
+        if len(self.outputs) == 0:
+            raise ValueError("RNN has no output")
+        elif len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def complete_rnn_op(self):
+        main_program = self.helper.main_program
+        rnn_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        local_inputs = set()
+
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
+        for var in self.inputs:
+            local_inputs.add(var.name)
+        for m in self.memories:
+            local_inputs.add(m)
+
+        params = list()
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in local_inputs:
+                        params.append(in_var_name)
+
+        parameters = [parent_block.var(name) for name in params]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        inlinks = [parent_block.var(i.name) for i in self.inputs]
+        outlinks = self.outputs
+
+        boot_memories = []
+        pre_memories = []
+        memories = []
+        for _, mem in self.memories.iteritems():
+            boot_memories.append(mem.init)
+            pre_memories.append(mem.pre_mem.name)
+            mem_var = rnn_block.var(mem.mem.name)
+            assert isinstance(mem_var, Variable)
+            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
+
+            rnn_block.append_op(
+                type='rnn_memory_helper',
+                inputs={'X': [mem_var]},
+                outputs={'Out': [new_mem]},
+                attrs={'dtype': mem_var.dtype})
+
+            memories.append(new_mem.name)
+
+        parent_block.append_op(
+            type='recurrent',
+            inputs={
+                'inputs': inlinks,
+                'initial_states': boot_memories,
+                'parameters': parameters
+            },
+            outputs={'outputs': outlinks,
+                     'step_scopes': [step_scope]},
+            attrs={
+                'ex_states': pre_memories,
+                'states': memories,
+                'sub_block': rnn_block
+            })
+
+
+class WhileGuard(BlockGuard):
+    def __init__(self, while_op):
+        if not isinstance(while_op, While):
+            raise TypeError("WhileGuard takes a while op")
+        super(WhileGuard, self).__init__(while_op.helper.main_program)
+        self.while_op = while_op
+
+    def __enter__(self):
+        self.while_op.status = While.IN_WHILE_BLOCK
+        return super(WhileGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.while_op.status = While.AFTER_WHILE_BLOCK
+        self.while_op.complete()
+        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class While(object):
+    BEFORE_WHILE_BLOCK = 0
+    IN_WHILE_BLOCK = 1
+    AFTER_WHILE_BLOCK = 2
+
+    def __init__(self, cond, name=None, main_program=None):
+        self.helper = LayerHelper("while", name=name, main_program=main_program)
+        self.status = While.BEFORE_WHILE_BLOCK
+        if not isinstance(cond, Variable):
+            raise TypeError("condition should be a variable")
+        assert isinstance(cond, Variable)
+        if cond.dtype != core.DataType.BOOL:
+            raise TypeError("condition should be a bool variable")
+        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
+            raise TypeError("condition should be a bool scalar")
+        self.cond_var = cond
+
+    def block(self):
+        return WhileGuard(self)
+
+    def complete(self):
+        main_program = self.helper.main_program
+        while_block = main_program.current_block()
+        parent_block = main_program.block(main_program.current_block()
+                                          .parent_idx)
+
+        inner_outputs = {self.cond_var.name}
+        x_name_list = set()
+        for op in while_block.ops:
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    inner_outputs.add(out_var_name)
+
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        parent_block.append_op(
+            type='while',
+            inputs={
+                'X': [parent_block.var(x_name) for x_name in x_name_list],
+                'Condition': [self.cond_var]
+            },
+            outputs={'Out': out_vars,
+                     'StepScopes': [step_scope]},
+            attrs={'sub_block': while_block})
+
+
+def lod_rank_table(x, level=0, main_program=None):
+    """
+    This function creates an operator for creating a LOD_RANK_TABLE
+    using the input x.
+    """
+    helper = LayerHelper("lod_rank_table", **locals())
+    table = helper.create_variable(
+        type=core.VarDesc.VarType.LOD_RANK_TABLE,
+        name=unique_name("lod_rank_table"))
+    helper.append_op(
+        type='lod_rank_table',
+        inputs={'X': x},
+        outputs={'Out': table},
+        attrs={'level': level})
+    return table
+
+
+def max_sequence_len(rank_table, main_program=None):
+    """
+    This function creates an operator to calculate the length of
+    max seqence through input rank_table(should be a lod_rank_table)
+    """
+    helper = LayerHelper("max_seqence_len", **locals())
+    res = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="max_sequence_len",
+        inputs={"RankTable": rank_table},
+        outputs={"Out": res})
+    return res
+
+
+def topk(input, k, main_program=None, startup_program=None):
+    helper = LayerHelper('topk', **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.data_type)
+    topk_indices = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type='top_k',
+        inputs={'X': [input]},
+        outputs={'Out': [topk_out],
+                 'Indices': [topk_indices]},
+        attrs={'k': k})
+    return topk_out, topk_indices
+
+
+def lod_tensor_to_array(x, table, main_program=None):
+    """
+    This function creates an operator to convert an LOD_Tensor to
+    an array.
+    """
+    helper = LayerHelper("lod_tensor_to_array", **locals())
+    array = helper.create_variable(
+        name=unique_name("lod_tensor_to_array"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=x.dtype)
+    helper.append_op(
+        type='lod_tensor_to_array',
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': array})
+    return array
+
+
+def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
+    """
+    This function creates an operator to convert an array to a
+    LOD_Tensor.
+    """
+    helper = LayerHelper("array_to_lod_tensor", **locals())
+    tmp = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="array_to_lod_tensor",
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': tmp})
+    return tmp
+
+
+def increment(x,
+              value=1.0,
+              in_place=True,
+              main_program=None,
+              startup_program=None):
+    """
+    This function creates an operator to increment each value in the input
+    `x` by an amount: `value` as mentioned in the input parameter. This
+    operation is performed in-place by default.
+    """
+    helper = LayerHelper("increment", **locals())
+    if not in_place:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = x
+    helper.append_op(
+        type='increment',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'step': float(value)})
+    return out
+
+
+def array_write(x, i, array=None, main_program=None, startup_program=None):
+    """
+    This function creates an operator to write the data out as a
+    LOD_TENSOR_ARRAY.
+    """
+    helper = LayerHelper('array_write', **locals())
+    if array is None:
+        array = helper.create_variable(
+            name="{0}.out".format(helper.name),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+    helper.append_op(
+        type='write_to_array',
+        inputs={'X': [x],
+                'I': [i]},
+        outputs={'Out': [array]})
+    return array
+
+
+def create_array(dtype, main_program=None):
+    helper = LayerHelper("array", **locals())
+    return helper.create_variable(
+        name="{0}.out".format(helper.name),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=dtype)
+
+
+def less_than(x, y, cond=None, main_program=None, **ignored):
+    helper = LayerHelper("less_than", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='less_than', inputs={'X': [x],
+                                  'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
+def array_read(array, i, main_program=None, startup_program=None):
+    """
+    This function creates an operator to read the data in as a
+    LOD_TENSOR_ARRAY.
+    """
+    helper = LayerHelper('array_read', **locals())
+    if not isinstance(
+            array,
+            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+        raise TypeError("array should be tensor array vairable")
+    out = helper.create_tmp_variable(dtype=array.dtype)
+    helper.append_op(
+        type='read_from_array',
+        inputs={'X': [array],
+                'I': [i]},
+        outputs={'Out': [out]})
+    return out
+
+
+def shrink_memory(x, i, table, main_program=None, startup_program=None):
+    """
+    This function creates an operator to shrink_rnn_memory using the RankTable
+    as mentioned in the input parameter.
+    """
+    helper = LayerHelper('shrink_memory', **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='shrink_rnn_memory',
+        inputs={'X': [x],
+                'I': [i],
+                'RankTable': [table]},
+        outputs={'Out': [out]},
+        attrs={})
+    return out
+
+
+def array_length(array, main_program=None):
+    """
+    This function creates an operator to find the length of the
+    LOD_TENSOR_ARRAY.
+    """
+    helper = LayerHelper('array_length', **locals())
+    tmp = helper.create_tmp_variable(dtype='int64')
+    tmp.stop_gradient = True
+    helper.append_op(
+        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
+    return tmp
+
+
+class ConditionalBlockGuard(BlockGuard):
+    def __init__(self, block):
+        if not isinstance(block, ConditionalBlock):
+            raise TypeError("block should be conditional block")
+        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
+        self.block = block
+
+    def __enter__(self):
+        return super(ConditionalBlockGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.block.complete()
+        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
+                                                           exc_tb)
+
+
+class ConditionalBlock(object):
+    def __init__(self,
+                 inputs,
+                 name=None,
+                 main_program=None,
+                 startup_program=None):
+        for each_input in inputs:
+            if not isinstance(each_input, Variable):
+                raise TypeError("Each input should be variable")
+        self.inputs = inputs
+        self.helper = LayerHelper(
+            'conditional_block',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+
+    def block(self):
+        return ConditionalBlockGuard(self)
+
+    def complete(self):
+        inside_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(inside_block.parent_idx)
+
+        intermediate = set()
+        params = set()
+
+        for each_op in inside_block.ops:
+            assert isinstance(each_op, Operator)
+            for iname in each_op.input_names:
+                for in_var_name in each_op.input(iname):
+                    if in_var_name not in intermediate:
+                        params.add(in_var_name)
+
+            for oname in each_op.output_names:
+                for out_var_name in each_op.output(oname):
+                    intermediate.add(out_var_name)
+        input_set = set([ipt.name for ipt in self.inputs])
+
+        param_list = [
+            parent_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name not in intermediate
+        ]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+        parent_block.append_op(
+            type='conditional_block',
+            inputs={
+                'X': self.inputs,
+                'Params': param_list,
+            },
+            outputs={'Out': out_list,
+                     'Scope': [step_scope]},
+            attrs={'sub_block': inside_block})
+
+
+class IfElseBlockGuard(object):
+    def __init__(self, is_true, ifelse):
+        if not isinstance(ifelse, IfElse):
+            raise TypeError("ifelse must be an instance of IfElse class")
+
+        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("You cannot invoke IfElse.block() inside a block")
+
+        self.is_true = is_true
+        self.ie = ifelse
+        if is_true:
+            self.cond_block = ifelse.conditional_true_block
+        else:
+            self.cond_block = ifelse.conditional_false_block
+
+        if not isinstance(self.cond_block, ConditionalBlock):
+            raise TypeError("Unexpected situation")
+
+        self.cond_block = self.cond_block.block()
+
+    def __enter__(self):
+        self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS
+        self.cond_block.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.cond_block.__exit__(exc_type, exc_val, exc_tb):
+            # re-raise inside exception
+            return False
+        if len(self.ie.output_table[1 if self.is_true else 0]) == 0:
+            raise ValueError("Must set output inside block")
+        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
+
+
+class IfElse(object):
+    OUT_IF_ELSE_BLOCKS = 0
+    IN_IF_ELSE_TRUE_BLOCKS = 1
+    IN_IF_ELSE_FALSE_BLOCKS = 2
+
+    def __init__(self, cond, name=None, main_program=None,
+                 startup_program=None):
+        if not isinstance(cond, Variable):
+            raise TypeError("cond must be a Variable")
+        self.helper = LayerHelper(
+            'ifelse',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+        self.cond = cond
+        self.input_table = {}
+        self.status = IfElse.OUT_IF_ELSE_BLOCKS
+        self.conditional_true_block = ConditionalBlock(inputs=[self.cond])
+        self.conditional_false_block = ConditionalBlock(inputs=[self.cond])
+        self.output_table = ([], [])  # (true_outs, false_outs)
+
+    def input(self, x):
+        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("input must in true/false blocks")
+        if id(x) not in self.input_table:
+            parent_block = self.parent_block()
+            out_true = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+
+            out_false = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+            parent_block.append_op(
+                type='split_lod_tensor',
+                inputs={
+                    'X': x,
+                    'Mask': self.cond,
+                },
+                outputs={'OutTrue': out_true,
+                         'OutFalse': out_false},
+                attrs={'level': 0})
+            self.input_table[id(x)] = (out_true, out_false)
+        else:
+            out_true, out_false = self.input_table[id(x)]
+
+        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
+            return out_true
+        else:
+            return out_false
+
+    def parent_block(self):
+        current_block = self.helper.main_program.current_block()
+        return self.helper.main_program.block(current_block.parent_idx)
+
+    def true_block(self):
+        return IfElseBlockGuard(True, self)
+
+    def false_block(self):
+        return IfElseBlockGuard(False, self)
+
+    def output(self, *outs):
+        if self.status == self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("output can only be invoked in the sub-block")
+
+        out_table = self.output_table[1 if self.status ==
+                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
+        parent_block = self.parent_block()
+        for each_out in outs:
+            if not isinstance(each_out, Variable):
+                raise TypeError("Each output should be a variable")
+            # create outside tensor
+            outside_out = parent_block.create_var(
+                name=unique_name("_".join([self.helper.name, 'output'])),
+                dtype=each_out.dtype)
+            out_table.append(outside_out)
+
+            # assign local var to outside
+            assign(
+                input=each_out,
+                output=outside_out,
+                main_program=self.helper.main_program,
+                startup_program=self.helper.startup_program)
+
+    def __call__(self):
+        if self.status != self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("IfElse::__call__ must be out of sub-block")
+        false_len, true_len = map(len, self.output_table)
+        if false_len == 0 and true_len == 0:
+            raise ValueError("Must invoke true_block/false_block before "
+                             "__call__")
+        elif false_len != true_len and false_len != 0 and true_len != 0:
+            raise ValueError("The output side must be same")
+        elif false_len == 0 or true_len == 0:
+            return self.output_table[0 if false_len != 0 else 1]
+
+        # else none of false_len/true_len is zero
+        # merge together
+        rlist = []
+        for false_var, true_var in zip(*self.output_table):
+            rlist.append(
+                merge_lod_tensor(
+                    in_true=true_var,
+                    in_false=false_var,
+                    mask=self.cond,
+                    x=self.cond,
+                    level=0,
+                    main_program=self.helper.main_program,
+                    startup_program=self.helper.startup_program))
+        return rlist
+
+
+class DynamicRNN(object):
+    BEFORE_RNN = 0
+    IN_RNN = 1
+    AFTER_RNN = 2
+
+    def __init__(self, name=None, main_program=None, startup_program=None):
+        self.helper = LayerHelper(
+            'dynamic_rnn',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+        self.status = DynamicRNN.BEFORE_RNN
+        self.lod_rank_table = None
+        self.max_seq_len = None
+        self.step_idx = None
+        self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64')
+        self.mem_dict = dict()
+        self.output_array = []
+        self.outputs = []
+        self.cond = self.helper.create_tmp_variable(dtype='bool')
+        self.cond.stop_gradient = False
+        self.while_op = While(self.cond)
+        self.input_array = []
+        self.mem_link = []
+
+    def step_input(self, x):
+        self._assert_in_rnn_block_("step_input")
+        if not isinstance(x, Variable):
+            raise TypeError(
+                "step_input() can only take a Variable as its input")
+        parent_block = self._parent_block_()
+        if self.lod_rank_table is None:
+            self.lod_rank_table = parent_block.create_var(
+                name=unique_name('lod_rank_table'),
+                type=core.VarDesc.VarType.LOD_RANK_TABLE)
+            self.lod_rank_table.stop_gradient = True
+            parent_block.append_op(
+                type='lod_rank_table',
+                inputs={"X": x},
+                outputs={"Out": self.lod_rank_table})
+            self.max_seq_len = parent_block.create_var(
+                name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64')
+            self.max_seq_len.stop_gradient = False
+            parent_block.append_op(
+                type='max_sequence_len',
+                inputs={'RankTable': self.lod_rank_table},
+                outputs={"Out": self.max_seq_len})
+            self.cond.stop_gradient = True
+            parent_block.append_op(
+                type='less_than',
+                inputs={'X': self.step_idx,
+                        'Y': self.max_seq_len},
+                outputs={'Out': self.cond})
+
+        input_array = parent_block.create_var(
+            name=unique_name('dynamic_rnn_input_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+        self.input_array.append((input_array, x.dtype))
+        parent_block.append_op(
+            type='lod_tensor_to_array',
+            inputs={'X': x,
+                    'RankTable': self.lod_rank_table},
+            outputs={'Out': input_array})
+        return array_read(
+            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
+
+    @contextlib.contextmanager
+    def block(self):
+        if self.status != DynamicRNN.BEFORE_RNN:
+            raise ValueError("rnn.block() can only be invoke once")
+        self.step_idx = fill_constant(shape=[1], dtype='int64', value=0)
+        self.step_idx.stop_gradient = False
+        self.status = DynamicRNN.IN_RNN
+        with self.while_op.block():
+            yield
+            increment(
+                x=self.step_idx,
+                value=1.0,
+                in_place=True,
+                **self.helper.to_kwargs)
+
+            for new_mem, mem_array in self.mem_link:
+                array_write(
+                    x=new_mem,
+                    i=self.step_idx,
+                    array=mem_array,
+                    **self.helper.to_kwargs)
+
+            less_than(
+                x=self.step_idx,
+                y=self.max_seq_len,
+                cond=self.cond,
+                **self.helper.to_kwargs)
+
+        self.status = DynamicRNN.AFTER_RNN
+        for each_array in self.output_array:
+            self.outputs.append(
+                array_to_lod_tensor(
+                    x=each_array,
+                    table=self.lod_rank_table,
+                    **self.helper.to_kwargs))
+
+    def __call__(self, *args, **kwargs):
+        if self.status != DynamicRNN.AFTER_RNN:
+            raise ValueError(
+                "Dynamic RNN outputs can only be retrieved after rnn block")
+        if len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def memory(self, init=None, shape=None, value=0.0, dtype='float32'):
+        self._assert_in_rnn_block_('memory')
+        if init is not None:
+            if not isinstance(init, Variable):
+                raise TypeError(
+                    "The input arg `init` of memory() must be a Variable")
+            parent_block = self._parent_block_()
+            mem_array = parent_block.create_var(
+                name=unique_name('dynamic_rnn_mem_array'),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=init.dtype)
+            parent_block.append_op(
+                type='write_to_array',
+                inputs={'X': init,
+                        'I': self.zero_idx},
+                outputs={'Out': mem_array})
+            retv = array_read(
+                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
+            retv = shrink_memory(
+                x=retv,
+                i=self.step_idx,
+                table=self.lod_rank_table,
+                **self.helper.to_kwargs)
+            self.mem_dict[retv.name] = mem_array
+            return retv
+        else:
+            if len(self.input_array) == 0:
+                raise ValueError(
+                    "step_input should be invoked before memory(shape=..., value=...)"
+                )
+            parent_block = self._parent_block_()
+            init = parent_block.create_var(
+                name=unique_name('mem_init'), dtype=dtype)
+            arr, dtype = self.input_array[0]
+            in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype)
+            parent_block.append_op(
+                type='read_from_array',
+                inputs={'X': [arr],
+                        'I': [self.zero_idx]},
+                outputs={'Out': [in0]})
+            parent_block.append_op(
+                type='fill_constant_batch_size_like',
+                inputs={'Input': [in0]},
+                outputs={'Out': [init]},
+                attrs={
+                    'shape': [-1] + shape,
+                    'value': float(value),
+                    'dtype': init.dtype
+                })
+            return self.memory(init=init)
+
+    def update_memory(self, ex_mem, new_mem):
+        self._assert_in_rnn_block_('update_memory')
+        if not isinstance(ex_mem, Variable):
+            raise TypeError("The input arg `ex_mem` of update_memory() must "
+                            "be a Variable")
+        if not isinstance(new_mem, Variable):
+            raise TypeError("The input arg `new_mem` of update_memory() must "
+                            "be a Variable")
+
+        mem_array = self.mem_dict.get(ex_mem.name, None)
+        if mem_array is None:
+            raise ValueError("Please invoke memory before update_memory")
+        if self.lod_rank_table is None:
+            raise ValueError("Please invoke step_input before update_memory")
+
+        self.mem_link.append((new_mem, mem_array))
+
+    def output(self, *outputs):
+        self._assert_in_rnn_block_('output')
+        parent_block = self._parent_block_()
+        for each in outputs:
+            outside_array = parent_block.create_var(
+                name=unique_name("_".join(
+                    [self.helper.name, "output_array", each.name])),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=each.dtype)
+            array_write(x=each, i=self.step_idx, array=outside_array)
+            self.output_array.append(outside_array)
+
+    def _parent_block_(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+
+        return parent_block
+
+    def _assert_in_rnn_block_(self, method):
+        if self.status != DynamicRNN.IN_RNN:
+            raise ValueError("{0} can only be invoked inside rnn block.".format(
+                method))
diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py
new file mode 100644
index 0000000000..f03d8e3c3e
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -0,0 +1,57 @@
+from .. import core
+from ..layer_helper import LayerHelper
+
+__all__ = ['data']
+
+
+def data(name,
+         shape,
+         append_batch_size=True,
+         dtype='float32',
+         lod_level=0,
+         type=core.VarDesc.VarType.LOD_TENSOR,
+         main_program=None,
+         startup_program=None,
+         stop_gradient=True):
+    """
+    Data Layer.
+
+    Args:
+       name: The name/alias of the function
+       shape: Tuple declaring the shape.
+       append_batch_size: Whether or not to append the data as a batch.
+       dtype: The type of data : float32, float_16, int etc
+       type: The output type. By default it is LOD_TENSOR.
+       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
+       main_program: Name of the main program that calls this
+       startup_program: Name of the startup program
+       stop_gradient: A boolean that mentions whether gradient should flow.
+
+    This function takes in input and based on whether data has
+    to be returned back as a minibatch, it creates the global variable using
+    the helper functions. The global variables can be accessed by all the
+    following operations and layers in the graph.
+
+    All the input variables of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+    helper = LayerHelper('data', **locals())
+    shape = list(shape)
+    for i in xrange(len(shape)):
+        if shape[i] is None:
+            shape[i] = -1
+            append_batch_size = False
+        elif shape[i] < 0:
+            append_batch_size = False
+
+    if append_batch_size:
+        shape = [-1] + shape  # append batch size as -1
+
+    return helper.create_global_variable(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        type=type,
+        stop_gradient=stop_gradient,
+        lod_level=lod_level)
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
new file mode 100644
index 0000000000..2be8c8af9b
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -0,0 +1,858 @@
+"""
+All layers just related to the neural network.
+"""
+
+from ..layer_helper import LayerHelper
+from ..initializer import Normal, Constant
+from ..framework import Variable
+
+__all__ = [
+    'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
+    'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
+    'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand'
+]
+
+
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None,
+       main_program=None,
+       startup_program=None):
+    """
+    Fully Connected Layer.
+
+    Args:
+       input: The input tensor to the function
+       size: The size of the layer
+       num_flatten_dims: Number of columns in input
+       param_attr: The parameters/weights to the FC Layer
+       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
+       bias_attr: The bias parameter for the FC layer
+       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
+       act: Activation to be applied to the output of FC layer
+       name: Name/alias of the function
+       main_program: Name of the main program that calls this
+       startup_program: Name of the startup program
+
+    This function can take in multiple inputs and performs the Fully Connected
+    function (linear transformation) on top of each of them.
+    So for input x, the output will be : Wx + b. Where W is the parameter,
+    b the bias and x is the input.
+
+    The function also applies an activation (non-linearity) on top of the
+    output, if activation is passed in the input.
+
+    All the input variables of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+    helper = LayerHelper('fc', **locals())
+
+    dtype = helper.input_dtype()
+
+    mul_results = []
+    for input_var, param_attr in helper.iter_inputs_and_params():
+        input_shape = input_var.shape
+        param_shape = [
+            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
+        ] + [size]
+        w = helper.create_parameter(
+            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
+        tmp = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="mul",
+            inputs={
+                "X": input_var,
+                "Y": w,
+            },
+            outputs={"Out": tmp},
+            attrs={'x_num_col_dims': num_flatten_dims,
+                   'y_num_col_dims': 1})
+        mul_results.append(tmp)
+
+    # sum
+    if len(mul_results) == 1:
+        pre_bias = mul_results[0]
+    else:
+        pre_bias = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+    # add bias
+    pre_activation = helper.append_bias_op(pre_bias)
+    # add activation
+    return helper.append_activation(pre_activation)
+
+
+def embedding(input,
+              size,
+              is_sparse=False,
+              param_attr=None,
+              dtype='float32',
+              main_program=None,
+              startup_program=None):
+    """
+    Embedding Layer.
+
+    Args:
+       param_initializer:
+       input: The input to the function
+       size: The size of the layer
+       is_sparse: A flag that decleares whether the input is sparse
+       param_attr: Parameters for this layer
+       dtype: The type of data : float32, float_16, int etc
+       main_program: Name of the main program that calls this
+       startup_program: Name of the startup program
+
+    This function can take in the input (which is a vector of IDs) and
+    performs a lookup in the lookup_table using these IDs, to result into
+    the embedding of each ID in the input.
+
+    All the input variables of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+
+    helper = LayerHelper('embedding', **locals())
+    w = helper.create_parameter(
+        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='lookup_table',
+        inputs={'Ids': input,
+                'W': w},
+        outputs={'Out': tmp},
+        attrs={'is_sparse': is_sparse})
+    return tmp
+
+
+# TODO(qijun): expose H0 and C0
+def dynamic_lstm(input,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 dtype='float32',
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('lstm', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    hidden = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm',
+        inputs={'Input': input,
+                'Weight': weight,
+                'Bias': bias},
+        outputs={
+            'Hidden': hidden,
+            'Cell': cell,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation
+        })
+    return hidden, cell
+
+
+def gru_unit(input,
+             hidden,
+             size,
+             weight=None,
+             bias=None,
+             activation='tanh',
+             gate_activation='sigmoid',
+             main_program=None,
+             startup_program=None):
+    """
+    GRUUnit Operator implements partial calculations of the GRU unit as following:
+
+    $$
+    update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+    reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+    output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+    output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+    $$
+
+    which is same as one time step of GRU Operator.
+
+    @note To implement the complete GRU unit, fully-connected operator must be
+    used before to feed xu, xr and xc as the Input of GRUUnit operator.
+
+    TODO(ChunweiYan) add more document here
+    """
+    activation_dict = dict(
+        identity=0,
+        sigmoid=1,
+        tanh=2,
+        relu=3, )
+    activation = activation_dict[activation]
+    gate_activation = activation_dict[gate_activation]
+
+    helper = LayerHelper('gru_unit', **locals())
+    dtype = helper.input_dtype()
+    size = size / 3
+
+    # create weight
+    if weight is None:
+        weight = helper.create_parameter(
+            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+
+    # create bias
+    if bias is None:
+        bias_size = [1, 3 * size]
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='gru_unit',
+        inputs={'Input': input,
+                'HiddenPrev': hidden,
+                'Weight': weight},
+        outputs={
+            'Gate': gate,
+            'ResetHiddenPrev': reset_hidden_pre,
+            'Hidden': updated_hidden,
+        },
+        attrs={
+            'activation': 0,
+            'gate_activation': 1,
+        })
+
+    return updated_hidden, reset_hidden_pre, gate
+
+
+def linear_chain_crf(input,
+                     label,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('linear_chain_crf', **locals())
+    size = input.shape[1]
+    transition = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[size + 2, size],
+        dtype=helper.input_dtype())
+    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
+    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='linear_chain_crf',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={
+            "Alpha": [alpha],
+            "EmissionExps": [emission_exps],
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        })
+
+    return log_likelihood
+
+
+def crf_decoding(input,
+                 param_attr,
+                 label=None,
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='crf_decoding',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={"ViterbiPath": [viterbi_path]})
+
+    return viterbi_path
+
+
+def cos_sim(X, Y, **kwargs):
+    """
+    This function performs the cosine similarity between two tensors
+    X and Y and returns that as the output.
+    """
+    helper = LayerHelper('cos_sim', **kwargs)
+    out = helper.create_tmp_variable(dtype=X.dtype)
+    xnorm = helper.create_tmp_variable(dtype=X.dtype)
+    ynorm = helper.create_tmp_variable(dtype=X.dtype)
+    helper.append_op(
+        type='cos_sim',
+        inputs={'X': [X],
+                'Y': [Y]},
+        outputs={'Out': [out],
+                 'XNorm': [xnorm],
+                 'YNorm': [ynorm]})
+    return out
+
+
+def cross_entropy(input, label, **kwargs):
+    """
+    This function computes cross_entropy using the input and label.
+    """
+    helper = LayerHelper('cross_entropy', **kwargs)
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='cross_entropy',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]},
+        attrs=kwargs)
+    return out
+
+
+def square_error_cost(input, label, **kwargs):
+    """
+    This functions returns the squared error cost using the input and label.
+    The output is appending the op to do the above.
+    """
+    helper = LayerHelper('square_error_cost', **kwargs)
+    minus_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='elementwise_sub',
+        inputs={'X': [input],
+                'Y': [label]},
+        outputs={'Out': [minus_out]})
+
+    square_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]})
+    return square_out
+
+
+def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
+    """
+    This function computes the accuracy using the input and label.
+    The output is the top_k inputs and their indices.
+    """
+    helper = LayerHelper("accuracy", **kwargs)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": k})
+    acc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        outputs={
+            "Accuracy": [acc_out],
+            "Correct": [correct],
+            "Total": [total],
+        })
+    return acc_out
+
+
+def chunk_eval(input,
+               label,
+               chunk_scheme,
+               num_chunk_types,
+               excluded_chunk_types=None,
+               **kwargs):
+    """
+    This function computes and outputs the precision, recall and
+    F1-score of chunk detection.
+    """
+    helper = LayerHelper("chunk_eval", **kwargs)
+
+    # prepare output
+    precision = helper.create_tmp_variable(dtype="float32")
+    recall = helper.create_tmp_variable(dtype="float32")
+    f1_score = helper.create_tmp_variable(dtype="float32")
+    num_infer_chunks = helper.create_tmp_variable(dtype="int64")
+    num_label_chunks = helper.create_tmp_variable(dtype="int64")
+    num_correct_chunks = helper.create_tmp_variable(dtype="int64")
+
+    helper.append_op(
+        type="chunk_eval",
+        inputs={"Inference": [input],
+                "Label": [label]},
+        outputs={
+            "Precision": [precision],
+            "Recall": [recall],
+            "F1-Score": [f1_score],
+            "NumInferChunks": [num_infer_chunks],
+            "NumLabelChunks": [num_label_chunks],
+            "NumCorrectChunks": [num_correct_chunks]
+        },
+        attrs={
+            "num_chunk_types": num_chunk_types,
+            'chunk_scheme': chunk_scheme,
+            'excluded_chunk_types': excluded_chunk_types or []
+        })
+    return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks
+
+
+def sequence_conv(input,
+                  num_filters,
+                  filter_size=3,
+                  filter_stride=1,
+                  padding=None,
+                  bias_attr=None,
+                  param_attr=None,
+                  act=None,
+                  main_program=None,
+                  startup_program=None):
+    """
+    This function creates the op for sequence_conv, using the inputs and
+    other convolutional configurations for the filters and stride as given
+    in the input parameters to the function.
+    """
+
+    # FIXME(dzh) : want to unify the argument of python layer
+    # function. So we ignore some unecessary attributes.
+    # such as, padding_trainable, context_start.
+
+    helper = LayerHelper('sequence_conv', **locals())
+    dtype = helper.input_dtype()
+    filter_shape = [filter_size * input.shape[1], num_filters]
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='sequence_conv',
+        inputs={
+            'X': [input],
+            'Filter': [filter_param],
+        },
+        outputs={"Out": pre_bias},
+        attrs={
+            'contextStride': filter_stride,
+            'contextStart': -int(filter_size / 2),
+            'contextLength': filter_size
+        })
+    pre_act = helper.append_bias_op(pre_bias)
+    return helper.append_activation(pre_act)
+
+
+def conv2d(input,
+           num_filters,
+           filter_size,
+           stride=None,
+           padding=None,
+           groups=None,
+           param_attr=None,
+           bias_attr=None,
+           act=None,
+           name=None,
+           main_program=None,
+           startup_program=None):
+    """
+    This function creates the op for a 2-dimensional Convolution.
+    This is performed using the parameters of filters(size, dimensionality etc)
+    , stride and other configurations for a Convolution operation.
+    This funciton can also append an activation on top of the
+    conv-2d output, if mentioned in the input parameters.
+    """
+
+    if stride is None:
+        stride = [1, 1]
+    helper = LayerHelper('conv2d', **locals())
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:
+            raise ValueError("num_channels must be divisible by groups.")
+        num_filter_channels = num_channels / groups
+
+    if isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+
+    input_shape = input.shape
+    filter_shape = [num_filters, num_filter_channels] + filter_size
+
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        default_initializer=_get_default_param_initializer())
+
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='conv2d_cudnn',
+        inputs={
+            'Input': input,
+            'Filter': filter_param,
+        },
+        outputs={"Output": pre_bias},
+        attrs={'strides': stride,
+               'paddings': padding,
+               'groups': groups})
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+
+    return helper.append_activation(pre_act)
+
+
+def sequence_pool(input, pool_type, **kwargs):
+    """
+    This function add the operator for sequence pooling.
+    This is applied on top of the input using pool_type mentioned
+    in the parameters.
+    """
+    helper = LayerHelper('sequence_pool', input=input, **kwargs)
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="sequence_pool",
+        inputs={"X": input},
+        outputs={"Out": pool_out,
+                 "MaxIndex": max_index},
+        attrs={"pooltype": pool_type.upper()})
+
+    return pool_out
+
+
+def pool2d(input,
+           pool_size,
+           pool_type,
+           pool_stride=None,
+           pool_padding=None,
+           global_pooling=False,
+           main_program=None,
+           startup_program=None):
+    """
+    This function adds the operator for pooling in 2 dimensions, using the
+    pooling configurations mentioned in input parameters.
+    """
+    if pool_padding is None:
+        pool_padding = [0, 0]
+    if pool_stride is None:
+        pool_stride = [1, 1]
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+            str(pool_type))
+    if isinstance(pool_size, int):
+        pool_size = [pool_size, pool_size]
+    if isinstance(pool_stride, int):
+        pool_stride = [pool_stride, pool_stride]
+    if isinstance(pool_padding, int):
+        pool_padding = [pool_padding, pool_padding]
+
+    helper = LayerHelper('pool2d', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="pool2d",
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding
+        })
+
+    return pool_out
+
+
+def batch_norm(input,
+               act=None,
+               is_test=False,
+               momentum=0.9,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               data_layout='NCHW',
+               main_program=None,
+               startup_program=None):
+    """
+    This function helps create an operator to implement
+    the BatchNorm layer using the configurations from the input parameters.
+    """
+    helper = LayerHelper('batch_norm', **locals())
+    dtype = helper.input_dtype()
+
+    input_shape = input.shape
+    if data_layout == 'NCHW':
+        channel_num = input_shape[1]
+    else:
+        if data_layout == 'NHWC':
+            channel_num = input_shape[-1]
+        else:
+            raise ValueError("unsupported data layout:" + data_layout)
+
+    param_shape = [channel_num]
+
+    # create parameter
+    scale = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=dtype,
+        default_initializer=Constant(1.0))
+
+    bias = helper.create_parameter(
+        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
+
+    mean = helper.create_global_variable(
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
+
+    variance = helper.create_global_variable(
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
+
+    # create output
+    # mean and mean_out share the same memory
+    mean_out = mean
+    # variance and variance out share the same memory
+    variance_out = variance
+    saved_mean = helper.create_tmp_variable(dtype)
+    saved_variance = helper.create_tmp_variable(dtype)
+
+    batch_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="batch_norm",
+        inputs={
+            "X": input,
+            "Scale": scale,
+            "Bias": bias,
+            "Mean": mean,
+            "Variance": variance
+        },
+        outputs={
+            "Y": batch_norm_out,
+            "MeanOut": mean_out,
+            "VarianceOut": variance_out,
+            "SavedMean": saved_mean,
+            "SavedVariance": saved_variance
+        },
+        attrs={"momentum": momentum,
+               "epsilon": epsilon,
+               "is_test": is_test})
+
+    return helper.append_activation(batch_norm_out)
+
+
+def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
+
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+
+    return sentence_ids, sentence_scores
+
+
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of conv2d layer.
+
+    This layer is also known as deconvolution layer.
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.  None if use output size to
+            calculate filter_size
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride.
+        param_attr: Parameter Attribute.
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+
+    Returns:
+        Variable: Output image.
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    op_attr = dict()
+
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = padding
+
+    if isinstance(stride, int):
+        op_attr['strides'] = stride
+    elif stride is not None:
+        op_attr['strides'] = stride
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+        filter_size_h = output_size[0] - \
+                        (h_in - 1) * stride[0] + 2 * padding[0]
+        filter_size_w = output_size[1] - \
+                        (w_in - 1) * stride[1] + 2 * padding[1]
+        filter_size = [filter_size_h, filter_size_w]
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+
+    filter_shape = [input_channel, num_filters] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+
+    return out
+
+
+def sequence_expand(x, y, main_program=None, startup_program=None):
+    """Sequence Expand Layer. This layer will expand the input variable **x**
+    according to LoD information of **y**. And the following examples will
+    explain how sequence_expand works:
+
+    .. code-block:: text
+
+        * Case 1
+            x is a LoDTensor:
+                x.lod = [[0,       2, 3],
+                         [0, 1,    3, 4]]
+                x.data = [a, b, c, d]
+                x.dims = [4, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0,    2,    4],
+                         [0, 3, 6, 7, 8]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 2-level LoDTensor:
+                out.lod = [[0,                2,    4],
+                           [0,       3,       6, 7, 8]]
+                out.data = [a, a, a, b, b, b, c, d]
+                out.dims = [8, 1]
+
+        * Case 2
+            x is a Tensor:
+                x.data = [a, b, c]
+                x.dims = [3, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0, 2, 3, 6]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 1-level LoDTensor:
+                out.lod = [[0,    2, 3,      6]]
+                out.data = [a, a, b, c, c, c]
+                out.dims = [6, 1]
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a LoDTensor.
+        main_program (Program): The main program.
+        startup_program (Program): The startup program.
+
+    Returns:
+        Variable: The expanded variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[10, 20],
+                             dtype='float32', lod_level=1)
+            out = layers.sequence_expand(x=x, y=y)
+    """
+    helper = LayerHelper('sequence_expand', input=x, **locals())
+    dtype = helper.input_dtype()
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sequence_expand', inputs={'X': x,
+                                        'Y': y}, outputs={'Out': tmp})
+    return tmp
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
new file mode 100644
index 0000000000..fa312ace60
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -0,0 +1,9 @@
+from ..registry import register_layer
+__all__ = [
+    'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose',
+    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
+    'elementwise_sub', 'elementwise_mul', 'clip', 'abs'
+]
+
+for _OP in set(__all__):
+    globals()[_OP] = register_layer(_OP)
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
new file mode 100644
index 0000000000..a839ed897d
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -0,0 +1,130 @@
+from ..layer_helper import LayerHelper
+
+__all__ = [
+    'create_tensor', 'cast', 'concat', 'sums', 'assign',
+    'fill_constant_batch_size_like', 'fill_constant', 'ones', 'zeros'
+]
+
+
+def create_tensor(dtype, name=None, main_program=None, startup_program=None):
+    helper = LayerHelper("create_tensor", **locals())
+    return helper.create_variable(name=helper.name, dtype=dtype)
+
+
+def cast(x, dtype, main_program=None):
+    """
+    This function takes in the input with input_dtype
+    and casts it to the output_dtype as the output.
+    """
+    helper = LayerHelper('cast', **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='cast',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'in_dtype': x.dtype,
+               'out_dtype': out.dtype})
+    return out
+
+
+def concat(input, axis, main_program=None, startup_program=None):
+    """
+    This function concats the input along the axis mentioned
+    and returns that as the output.
+    """
+    helper = LayerHelper('concat', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='concat',
+        inputs={'X': input},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def sums(input, out=None, main_program=None, startup_program=None):
+    """
+    This function takes in the input and performs the sum operation on it
+    and returns that as the output.
+    """
+    helper = LayerHelper('sum', **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
+    return out
+
+
+def assign(input, output, main_program=None, startup_program=None):
+    helper = LayerHelper('assign', **locals())
+    helper.append_op(
+        type='scale',
+        inputs={'X': [input]},
+        outputs={'Out': [output]},
+        attrs={'scale': 1.0})
+    return output
+
+
+def fill_constant(shape,
+                  dtype,
+                  value,
+                  out=None,
+                  main_program=None,
+                  startup_program=None):
+    """
+    This function creates a tensor , with shape as mentioned in the input and
+    specified dtype and fills this up with a constant value that
+    comes in the input. It also sets the stop_gradient to be True.
+    """
+    helper = LayerHelper("fill_constant", **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant',
+        inputs={},
+        outputs={'Out': [out]},
+        attrs={'shape': shape,
+               'dtype': out.dtype,
+               'value': float(value)})
+    out.stop_gradient = True
+    return out
+
+
+def fill_constant_batch_size_like(input,
+                                  shape,
+                                  dtype,
+                                  value,
+                                  input_dim_idx=0,
+                                  output_dim_idx=0,
+                                  main_program=None,
+                                  startup_program=None):
+    helper = LayerHelper("fill_constant_batch_size_like", **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': [out]},
+        attrs={
+            'shape': shape,
+            'dtype': out.dtype,
+            'value': float(value),
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx
+        })
+    out.stop_gradient = True
+    return out
+
+
+def ones(shape, dtype, main_program=None):
+    """
+    This function performs the same function as fill_constant() declared above
+    with the constant value being 1.0.
+    """
+    return fill_constant(value=1.0, **locals())
+
+
+def zeros(shape, dtype, main_program=None):
+    """
+    This function performs the same function as fill_constant() declared above
+    with the constant value being 0.0.
+    """
+    return fill_constant(value=0.0, **locals())
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index 05728ad75a..7ef524318e 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -9,6 +9,7 @@ def simple_img_conv_pool(input,
                          pool_size,
                          pool_stride,
                          act,
+                         param_attr=None,
                          pool_type='max',
                          main_program=None,
                          startup_program=None):
@@ -16,6 +17,7 @@ def simple_img_conv_pool(input,
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
+        param_attr=param_attr,
         act=act,
         main_program=main_program,
         startup_program=startup_program)
@@ -36,6 +38,7 @@ def img_conv_group(input,
                    conv_padding=1,
                    conv_filter_size=3,
                    conv_act=None,
+                   param_attr=None,
                    conv_with_batchnorm=False,
                    conv_batchnorm_drop_rate=None,
                    pool_stride=1,
@@ -57,6 +60,7 @@ def img_conv_group(input,
 
     conv_padding = __extend_list__(conv_padding)
     conv_filter_size = __extend_list__(conv_filter_size)
+    param_attr = __extend_list__(param_attr)
     conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
     conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
 
@@ -70,6 +74,7 @@ def img_conv_group(input,
             num_filters=conv_num_filter[i],
             filter_size=conv_filter_size[i],
             padding=conv_padding[i],
+            param_attr=param_attr[i],
             act=local_conv_act,
             main_program=main_program,
             startup_program=startup_program)
@@ -101,6 +106,7 @@ def img_conv_group(input,
 def sequence_conv_pool(input,
                        num_filters,
                        filter_size,
+                       param_attr=None,
                        act="sigmoid",
                        pool_type="max",
                        main_program=None,
@@ -109,6 +115,7 @@ def sequence_conv_pool(input,
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
+        param_attr=param_attr,
         act=act,
         main_program=main_program,
         startup_program=startup_program)
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index 719e3b2563..bbdfab2df9 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -18,8 +18,9 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, global_step=None):
+    def __init__(self, global_step=None, regularization=None):
         self._global_step = global_step
+        self.regularization = regularization
         # Dictionary of accumulators. Some optimizer subclasses need to
         # allocate and manage extra variables associated with the parameters
         # to train. These variables are called accumulators.
@@ -199,7 +200,8 @@ class Optimizer(object):
         """
         params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
         # Add regularization if any
-        params_grads = append_regularization_ops(params_grads)
+        params_grads = append_regularization_ops(params_grads,
+                                                 self.regularization)
         optimize_ops = self.create_optimization_pass(params_grads, loss,
                                                      startup_program)
         return optimize_ops
@@ -209,9 +211,9 @@ class SGDOptimizer(Optimizer):
     """ Simple SGD optimizer without any state.
     """
 
-    def __init__(self, learning_rate, global_step=None):
+    def __init__(self, learning_rate, **kwargs):
         assert learning_rate is not None
-        super(SGDOptimizer, self).__init__(global_step)
+        super(SGDOptimizer, self).__init__(**kwargs)
         self.type = "sgd"
         self._learning_rate = learning_rate
 
@@ -236,14 +238,10 @@ class MomentumOptimizer(Optimizer):
     """
     _velocity_acc_str = "velocity"
 
-    def __init__(self,
-                 learning_rate,
-                 momentum,
-                 use_nesterov=False,
-                 global_step=None):
+    def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
         assert learning_rate is not None
         assert momentum is not None
-        super(MomentumOptimizer, self).__init__(global_step)
+        super(MomentumOptimizer, self).__init__(**kwargs)
         self.type = "momentum"
         self._learning_rate = learning_rate
         self._momentum = momentum
@@ -284,10 +282,10 @@ class AdagradOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
 
-    def __init__(self, learning_rate, epsilon=1.0e-6, global_step=None):
+    def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
         assert learning_rate is not None
         assert epsilon is not None
-        super(AdagradOptimizer, self).__init__(global_step)
+        super(AdagradOptimizer, self).__init__(**kwargs)
         self.type = "adagrad"
         self._learning_rate = learning_rate
         self._epsilon = epsilon
@@ -331,12 +329,12 @@ class AdamOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 global_step=None):
+                 **kwargs):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamOptimizer, self).__init__(global_step)
+        super(AdamOptimizer, self).__init__(**kwargs)
         self.type = "adam"
         self._learning_rate = learning_rate
         self._beta1 = beta1
@@ -436,12 +434,12 @@ class AdamaxOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 global_step=None):
+                 **kwargs):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamaxOptimizer, self).__init__()
+        super(AdamaxOptimizer, self).__init__(**kwargs)
         self.type = "adamax"
         self._learning_rate = learning_rate
         self._beta1 = beta1
@@ -514,16 +512,12 @@ class DecayedAdagradOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
 
-    def __init__(self,
-                 learning_rate,
-                 decay=0.95,
-                 epsilon=1.0e-6,
-                 global_step=None):
+    def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs):
         assert learning_rate is not None
         assert decay is not None
         assert epsilon is not None
 
-        super(DecayedAdagradOptimizer, self).__init__(global_step)
+        super(DecayedAdagradOptimizer, self).__init__(**kwargs)
         self.type = "decayed_adagrad"
         self._learning_rate = learning_rate
         self._decay = decay
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
index 86088fdd7c..7952a5ea51 100644
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -36,6 +36,8 @@ class ParamAttr(object):
     def to_attr(arg):
         if arg is None:
             return ParamAttr()
+        elif isinstance(arg, list) or isinstance(arg, tuple):
+            return [ParamAttr.to_attr(a) for a in arg]
         elif isinstance(arg, ParamAttr):
             return arg
         elif isinstance(arg, str) or isinstance(arg, unicode):
diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py
new file mode 100644
index 0000000000..6f5dd365de
--- /dev/null
+++ b/python/paddle/v2/fluid/registry.py
@@ -0,0 +1,186 @@
+import re
+import cStringIO
+import warnings
+import functools
+import inspect
+
+import proto.framework_pb2 as framework_pb2
+from framework import OpProtoHolder, Variable, Program, Operator
+from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
+
+__all__ = ['deprecated', 'register_layer']
+
+
+def _convert_(name):
+    """
+    Formatting.
+
+    Args:
+       name: The name/alias
+
+    This function takes in a name and converts it to a standard format of
+    group1_group2. Where as per the regular expression, group1 can have
+    alphabets and numbers and group2 has capital alphabets.
+
+    """
+    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
+
+
+def _generate_doc_string_(op_proto):
+    """
+    Generate docstring by OpProto
+
+    Args:
+        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
+
+    Returns:
+        str: the document string
+    """
+
+    def _type_to_str_(tp):
+        return framework_pb2.AttrType.Name(tp)
+
+    if not isinstance(op_proto, framework_pb2.OpProto):
+        raise TypeError("OpProto should be `framework_pb2.OpProto`")
+
+    buf = cStringIO.StringIO()
+    buf.write(op_proto.comment)
+    buf.write('\nArgs:\n')
+    for each_input in op_proto.inputs:
+        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        buf.write(line_begin)
+        buf.write(each_input.comment)
+        buf.write('\n')
+        buf.write(' ' * len(line_begin))
+        buf.write('Duplicable: ')
+        buf.write(str(each_input.duplicable))
+        buf.write('  Optional: ')
+        buf.write(str(each_input.dispensable))
+        buf.write('\n')
+
+    for each_attr in op_proto.attrs:
+        buf.write('    ')
+        buf.write(each_attr.name)
+        buf.write(' (')
+        buf.write(_type_to_str_(each_attr.type))
+        buf.write('): ')
+        buf.write(each_attr.comment)
+        buf.write('\n')
+
+    if len(op_proto.outputs) != 0:
+        buf.write('\nReturns:\n')
+        buf.write('    ')
+        for each_opt in op_proto.outputs:
+            if not each_opt.intermediate:
+                break
+        buf.write(each_opt.comment)
+
+    return buf.getvalue()
+
+
+def register_layer(op_type):
+    """
+    Register an Python layer for an Operator
+
+    Args:
+       op_type: The name of the operator to be created
+
+    This function takes in the operator type (sigmoid, mean , average etc) and
+    creates the operator functionality.
+
+    """
+    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
+    not_intermediate_outputs = \
+        filter(lambda output: not output.intermediate, op_proto.outputs)
+    intermediate_outputs = \
+        filter(lambda output: output.intermediate, op_proto.outputs)
+
+    if len(not_intermediate_outputs) != 1:
+        raise ValueError("Only one non intermediate output operator can be",
+                         "automatically generated")
+
+    if not_intermediate_outputs[0].duplicable:
+        raise ValueError(
+            "Only non duplicable op can be automatically generated")
+
+    for output in intermediate_outputs:
+        if output.duplicable:
+            raise ValueError("The op can be automatically generated only when ",
+                             "all intermediate ops are not duplicable")
+
+    o_name = not_intermediate_outputs[0].name
+    intermediate_output_names = [output.name for output in intermediate_outputs]
+
+    def infer_and_check_dtype(op_proto, **kwargs):
+        """
+        This function performs the sanity check for dtype and
+        instance type.
+        """
+        dtype = None
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            for each in val:
+                if not isinstance(each, Variable):
+                    raise ValueError("input of {0} must be variable".format(
+                        op_type))
+
+                if dtype is None:
+                    dtype = each.dtype
+                elif dtype != each.dtype:
+                    raise ValueError(
+                        "operator {0} must input same dtype. {1} vs {2}".format(
+                            op_type, dtype, each.dtype))
+
+        return dtype
+
+    def func(**kwargs):
+        helper = LayerHelper(op_type, **kwargs)
+
+        dtype = infer_and_check_dtype(op_proto, **kwargs)
+
+        inputs = dict()
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            inputs[ipt.name] = val
+
+        outputs = dict()
+        out = helper.create_tmp_variable(dtype=dtype)
+        outputs[o_name] = [out]
+        for name in intermediate_output_names:
+            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
+        return helper.append_activation(out)
+
+    func.__name__ = op_type
+    func.__doc__ = _generate_doc_string_(op_proto)
+    return func
+
+
+def deprecated(func_or_class):
+    """
+    Deprecated warning decorator. It will result a warning message.
+    Should be used before class or function, member function
+    """
+
+    @functools.wraps(func)
+    def func_wrapper(*args, **kwargs):
+        """
+        Wrap func with deprecated warning
+        """
+        warnings.simplefilter('always', DeprecationWarning)  #turn off filter
+        warnings.warn(
+            "Call to deprecated function {}.".format(func.__name__),
+            category=DeprecationWarning,
+            stacklevel=2)
+        warnings.simplefilter('default', DeprecationWarning)  #reset filter
+        return func(*args, **kwargs)
+
+    return func_wrapper
diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py
index c2c18e1951..d1955b0047 100644
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -3,7 +3,7 @@ import framework
 __all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay']
 
 
-def append_regularization_ops(parameters_and_grads):
+def append_regularization_ops(parameters_and_grads, regularization=None):
     """Create and add backward regularization Operators
 
     Creates and adds backward regularization operators in the BlockDesc.
@@ -14,6 +14,8 @@ def append_regularization_ops(parameters_and_grads):
     Args:
         parameters_and_grads: A list of (parameters, gradients) pairs
                               that need to be regularized.
+        regularization: A global regularizer. If the parameter is not
+                        set. It will be applied with regularizer.
 
     Returns:
         list of (parameters, gradients) pair with the regularized gradient
@@ -23,14 +25,19 @@ def append_regularization_ops(parameters_and_grads):
     """
     params_and_grads = []
     for param, grad in parameters_and_grads:
+        regularization_term = None
+        if param.regularizer is not None:
+            # Add variable for regularization term in grad block
+            regularization_term = param.regularizer(param, grad.block)
+        elif regularization is not None:
+            regularization_term = regularization(param, grad.block)
+
         # If no gradient or no regularization specified,
         # then we don't need to do anything
-        if grad is None or param.regularizer is None:
+        if grad is None or regularization_term is None:
             params_and_grads.append((param, grad))
             continue
 
-        # Add variable for regularization term in grad block
-        regularization_term = param.regularizer(param, grad.block)
         assert grad.shape == regularization_term.shape
 
         grad.block.append_op(
@@ -145,7 +152,7 @@ class L1DecayRegularizer(WeightDecayRegularizer):
 # import paddle.fluid as fluid
 #
 # hidden = fluid.layers.fc(...,
-#                          param_attr=ParamAttr(fluid.regularizer.Xavier()))
+#                          param_attr=fluid.regularizer.Xavier())
 #
 # It is no need to add a `Regularizer` as the class suffix
 L1Decay = L1DecayRegularizer
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index 4e71b6f345..3d336ffe95 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,9 +1,9 @@
 from __future__ import print_function
 
-import numpy as np
+import sys
+
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import sys
 
 
 def resnet_cifar10(input, depth=32):
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index d2693b602e..c3591a613a 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -150,7 +150,7 @@ def main():
     crf_decode = fluid.layers.crf_decoding(
         input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
 
-    precision, recall, f1_score = fluid.layers.chunk_eval(
+    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
         input=crf_decode,
         label=target,
         chunk_scheme="IOB",
@@ -176,20 +176,21 @@ def main():
 
     batch_id = 0
     for pass_id in xrange(PASS_NUM):
+        chunk_evaluator.reset(exe)
         for data in train_data():
-            outs = exe.run(fluid.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost, precision, recall, f1_score])
-            avg_cost_val = np.array(outs[0])
-            precision_val = np.array(outs[1])
-            recall_val = np.array(outs[2])
-            f1_score_val = np.array(outs[3])
+            cost, precision, recall, f1_score = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost] + chunk_evaluator.metrics)
+            pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
+                exe)
 
             if batch_id % 10 == 0:
-                print("avg_cost=" + str(avg_cost_val))
-                print("precision_val=" + str(precision_val))
-                print("recall_val:" + str(recall_val))
-                print("f1_score_val:" + str(f1_score_val))
+                print("avg_cost:" + str(cost) + " precision:" + str(
+                    precision) + " recall:" + str(recall) + " f1_score:" + str(
+                        f1_score) + " pass_precision:" + str(
+                            pass_precision) + " pass_recall:" + str(pass_recall)
+                      + " pass_f1_score:" + str(pass_f1_score))
 
             # exit early for CI
             exit(0)
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
new file mode 100644
index 0000000000..80ffc5a544
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -0,0 +1,119 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 10
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+decoder_size = hidden_dim
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last")
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(2):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+            outs = exe.run(framework.default_main_program(),
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 3:
+                exit(0)
+            batch_id += 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
index 80f8599679..c0b051f862 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -1,6 +1,51 @@
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+from paddle.v2.fluid.layer_helper import LayerHelper
+
+
+def lstm(x,
+         c_pre_init,
+         hidden_dim,
+         forget_bias=None,
+         main_program=None,
+         startup_program=None):
+    """
+    This function helps create an operator for the LSTM (Long Short Term
+    Memory) cell that can be used inside an RNN.
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+    rnn = fluid.layers.StaticRNN()
+    with rnn.step():
+        c_pre = rnn.memory(init=c_pre_init)
+        x_t = rnn.step_input(x)
+
+        before_fc = fluid.layers.concat(
+            input=[x_t, c_pre],
+            axis=1,
+            main_program=main_program,
+            startup_program=startup_program)
+        after_fc = fluid.layers.fc(input=before_fc,
+                                   size=hidden_dim * 4,
+                                   main_program=main_program,
+                                   startup_program=startup_program)
+
+        dtype = x.dtype
+        c = helper.create_tmp_variable(dtype)
+        h = helper.create_tmp_variable(dtype)
+
+        helper.append_op(
+            type='lstm_unit',
+            inputs={"X": after_fc,
+                    "C_prev": c_pre},
+            outputs={"C": c,
+                     "H": h},
+            attrs={"forget_bias": forget_bias})
+
+        rnn.update_memory(c_pre, c)
+        rnn.output(h)
+
+    return rnn()
 
 
 def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
@@ -23,8 +68,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
     c_pre_init = fluid.layers.fill_constant(
         dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
     c_pre_init.stop_gradient = False
-    layer_1_out = fluid.layers.lstm(
-        emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
+    layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
     layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2])
 
     prediction = fluid.layers.fc(input=layer_1_out,
diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
index bd52bef260..b052374dc7 100644
--- a/python/paddle/v2/fluid/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+from scipy.special import expit
 
 
 class TestExp(OpTest):
@@ -455,5 +456,20 @@ class TestHardSigmoid(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.002)
 
 
+class TestSwish(OpTest):
+    def setUp(self):
+        self.op_type = "swish"
+        X = np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        self.inputs = {'X': X}
+        self.attrs = {'beta': 2.3}
+        self.outputs = {'Y': X * expit(self.attrs['beta'] * X)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
index 819e65a653..53bf6f815b 100644
--- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
@@ -147,7 +147,13 @@ class TestChunkEvalOp(OpTest):
             'Recall': np.asarray(
                 [recall], dtype='float32'),
             'F1-Score': np.asarray(
-                [f1], dtype='float32')
+                [f1], dtype='float32'),
+            'NumInferChunks': np.asarray(
+                [self.num_infer_chunks], dtype='int64'),
+            'NumLabelChunks': np.asarray(
+                [self.num_label_chunks], dtype='int64'),
+            'NumCorrectChunks': np.asarray(
+                [self.num_correct_chunks], dtype='int64')
         }
 
     def setUp(self):
diff --git a/python/paddle/v2/fluid/tests/test_fill_op.py b/python/paddle/v2/fluid/tests/test_fill_op.py
new file mode 100644
index 0000000000..88337598c8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fill_op.py
@@ -0,0 +1,24 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.v2.fluid.core as core
+
+
+class TestFillOp(OpTest):
+    def setUp(self):
+        self.op_type = "fill"
+        val = np.random.random(size=[100, 200])
+        self.inputs = {}
+        self.attrs = {
+            'value': val.flatten().tolist(),
+            'shape': [100, 200],
+            'dtype': int(core.DataType.FP64)
+        }
+        self.outputs = {'Out': val.astype('float64')}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 57f6a362de..2286e94a90 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -29,7 +29,10 @@ class TestBook(unittest.TestCase):
             label = layers.data(name='label', shape=[1], dtype='int32')
             hidden1 = layers.fc(input=images, size=128, act='relu')
             hidden2 = layers.fc(input=hidden1, size=64, act='relu')
-            predict = layers.fc(input=hidden2, size=10, act='softmax')
+            predict = layers.fc(input=[hidden2, hidden1],
+                                size=10,
+                                act='softmax',
+                                param_attr=["sftmax.w1", "sftmax.w2"])
             cost = layers.cross_entropy(input=predict, label=label)
             avg_cost = layers.mean(x=cost)
             self.assertIsNotNone(avg_cost)
@@ -158,6 +161,15 @@ class TestBook(unittest.TestCase):
                     x=dat, label=lbl))
         print(str(program))
 
+    def test_seq_expand(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=1)
+            self.assertIsNotNone(layers.sequence_expand(x=x, y=y))
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py
index 7e34b3c91c..9abb09e53a 100644
--- a/python/paddle/v2/fluid/tests/test_lrn_op.py
+++ b/python/paddle/v2/fluid/tests/test_lrn_op.py
@@ -23,7 +23,7 @@ class TestLRNOp(OpTest):
         start = -(self.n - 1) / 2
         end = start + self.n
 
-        mid = np.empty((self.N, self.C, self.H, self.W), dtype=float)
+        mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32")
         mid.fill(self.k)
         for m in range(0, self.N):
             for i in range(0, self.C):
@@ -74,5 +74,4 @@ class TestLRNOp(OpTest):
 
 
 if __name__ == "__main__":
-    exit(0)  # LRN grad implement wrong
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py
index 70359d60cb..a021d4dd91 100644
--- a/python/paddle/v2/fluid/tests/test_reduce_op.py
+++ b/python/paddle/v2/fluid/tests/test_reduce_op.py
@@ -85,5 +85,19 @@ class Test1DReduce(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestReduceAll(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")}
+        self.attrs = {'reduce_all': True}
+        self.outputs = {'Out': self.inputs['X'].sum()}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_registry.py b/python/paddle/v2/fluid/tests/test_registry.py
new file mode 100644
index 0000000000..f8328f31cf
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_registry.py
@@ -0,0 +1,22 @@
+import unittest
+import warnings
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.registry as registry
+
+
+class TestRegistry(unittest.TestCase):
+    def test_registry_layer(self):
+        self.layer_type = "mean"
+        program = framework.Program()
+
+        x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32')
+        output = layers.mean(x)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        X = np.random.random((10, 10)).astype("float32")
+        mean_out = exe.run(program, feed={"X": X}, fetch_list=[output])
+        self.assertAlmostEqual(np.mean(X), mean_out)
diff --git a/python/paddle/v2/fluid/tests/test_reshape_op.py b/python/paddle/v2/fluid/tests/test_reshape_op.py
index 16bb6bb2af..18ee3aece6 100644
--- a/python/paddle/v2/fluid/tests/test_reshape_op.py
+++ b/python/paddle/v2/fluid/tests/test_reshape_op.py
@@ -17,5 +17,19 @@ class TestReshapeOp(OpTest):
         self.check_grad(["X"], "Out")
 
 
+class TestReshapeOpDimInfer(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [4, -1, 5]}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_row_conv_op.py b/python/paddle/v2/fluid/tests/test_row_conv_op.py
new file mode 100644
index 0000000000..1ed86e23ac
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py
@@ -0,0 +1,95 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def row_conv_forward(x, lod, wt):
+    out = np.zeros_like(x)
+    seq_info = lod[0]
+    num_sequences = len(seq_info) - 1
+    context_length = wt.shape[0]
+
+    for i in range(num_sequences):  # loop over number of sequences
+        start = seq_info[i]
+        end = seq_info[i + 1]
+        curinput = x[start:end, :]
+        curoutput = out[start:end, :]
+
+        cur_timesteps = end - start
+        for j in range(cur_timesteps):  # loop over different timesteps
+            for k in range(context_length):
+
+                if j + k >= cur_timesteps:
+                    continue
+                curoutput[j, :] += curinput[j + k, :] * wt[k, :]
+
+    return out
+
+
+class TestRowConvOp1(OpTest):
+    def setUp(self):
+
+        self.op_type = "row_conv"
+        lod = [[0, 2, 5, 7]]
+        T = lod[0][-1]
+        D = 16
+        context_length = 2
+
+        x = np.random.random((T, D)).astype("float32")
+        wt = np.random.random((context_length, D)).astype("float32")
+        self.inputs = {'X': (x, lod), 'Filter': wt}
+
+        out = row_conv_forward(x, lod, wt)
+        self.outputs = {'Out': (out, lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
+
+    def test_check_grad_ignore_wt(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter'))
+
+
+class TestRowConvOp2(OpTest):
+    def setUp(self):
+
+        self.op_type = "row_conv"
+        lod = [[0, 20, 50, 100]]
+        T = lod[0][-1]
+        D = 35
+        context_length = 35
+
+        x = np.random.random((T, D)).astype("float32")
+        wt = np.random.random((context_length, D)).astype("float32")
+        self.inputs = {'X': (x, lod), 'Filter': wt}
+
+        out = row_conv_forward(x, lod, wt)
+        self.outputs = {'Out': (out, lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    #max_relative_error is increased from 0.05 to 0.06 as for higher
+    #dimensional input, the dX on CPU for some values has max_rel_error 
+    #slightly more than 0.05
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Filter'], 'Out', max_relative_error=0.06, no_grad_set=set('X'))
+
+    def test_check_grad_ignore_wt(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_seq_expand.py b/python/paddle/v2/fluid/tests/test_sequence_expand.py
similarity index 89%
rename from python/paddle/v2/fluid/tests/test_seq_expand.py
rename to python/paddle/v2/fluid/tests/test_sequence_expand.py
index ff17edd04b..0f22612d3d 100644
--- a/python/paddle/v2/fluid/tests/test_seq_expand.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_expand.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-class TestSeqExpand(OpTest):
+class TestSequenceExpand(OpTest):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
         y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
@@ -21,7 +21,7 @@ class TestSeqExpand(OpTest):
         self.outputs = {'Out': out}
 
     def setUp(self):
-        self.op_type = 'seq_expand'
+        self.op_type = 'sequence_expand'
         self.set_data()
         self.compute()
 
@@ -32,7 +32,7 @@ class TestSeqExpand(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestSeqExpandCase1(TestSeqExpand):
+class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
         x_lod = [[0, 2, 5]]
@@ -41,7 +41,7 @@ class TestSeqExpandCase1(TestSeqExpand):
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
-class TestSeqExpandCase2(TestSeqExpand):
+class TestSequenceExpandCase2(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
         x_lod = [[0, 1]]
@@ -50,7 +50,7 @@ class TestSeqExpandCase2(TestSeqExpand):
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
-class TestSeqExpandCase3(TestSeqExpand):
+class TestSequenceExpandCase3(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
         x_lod = [[0, 1, 2, 3, 4]]
diff --git a/python/paddle/v2/fluid/tests/test_spp_op.py b/python/paddle/v2/fluid/tests/test_spp_op.py
new file mode 100644
index 0000000000..007723f0e3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
@@ -0,0 +1,68 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_pool2d_op import max_pool2D_forward_naive
+from test_pool2d_op import avg_pool2D_forward_naive
+
+
+class TestSppOp(OpTest):
+    def setUp(self):
+        self.op_type = "spp"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = input.shape
+        out_level_flatten = []
+        for i in xrange(self.pyramid_height):
+            bins = np.power(2, i)
+            kernel_size = [0, 0]
+            padding = [0, 0]
+            kernel_size[0] = np.ceil(hsize /
+                                     bins.astype("double")).astype("int32")
+            padding[0] = (
+                (kernel_size[0] * bins - hsize + 1) / 2).astype("int32")
+
+            kernel_size[1] = np.ceil(wsize /
+                                     bins.astype("double")).astype("int32")
+            padding[1] = (
+                (kernel_size[1] * bins - wsize + 1) / 2).astype("int32")
+            out_level = self.pool2D_forward_naive(input, kernel_size,
+                                                  kernel_size, padding)
+            out_level_flatten.append(
+                out_level.reshape(nsize, bins * bins * csize))
+            if i == 0:
+                output = out_level_flatten[i]
+            else:
+                output = np.concatenate((output, out_level_flatten[i]), 1)
+        # output = np.concatenate(out_level_flatten.tolist(), 0);
+        self.inputs = {'X': input.astype('float32'), }
+        self.attrs = {
+            'pyramid_height': self.pyramid_height,
+            'pooling_type': self.pool_type
+        }
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "avg":
+            self.check_grad(['X'], 'Out', max_relative_error=0.05)
+
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.pool_type = "max"
+
+
+class TestCase2(TestSppOp):
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.pool_type = "avg"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index bd97dc1199..7b7d1a1d16 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -383,19 +383,22 @@ class Parameters(object):
             params.deserialize(param_name, f)
         return params
 
-    def init_from_tar(self, f):
+    def init_from_tar(self, f, exclude_params=[]):
         """
         Different from `from_tar`, this interface can be used to
         init partial network parameters from another saved model.
 
         :param f: the initialized model file.
         :type f: tar file
+        :param exclude_params: the names of parameters that should  
+            not be initialized from the model file.
+        :type exclude_params: list of strings
         :return: Nothing.
         """
 
         tar_param = Parameters.from_tar(f)
         for pname in tar_param.names():
-            if pname in self.names():
+            if pname in self.names() and pname not in exclude_params:
                 self.set(pname, tar_param.get(pname))
 
 
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index 7e457f987d..27c82c95f7 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -390,8 +390,6 @@ def pipe_reader(left_cmd,
     if not callable(parser):
         raise TypeError("parser must be a callable object")
 
-    process = subprocess.Popen(
-        left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
     # TODO(typhoonzero): add a thread to read stderr
 
     # Always init a decompress object is better than
@@ -400,6 +398,8 @@ def pipe_reader(left_cmd,
         32 + zlib.MAX_WBITS)  # offset 32 to skip the header
 
     def reader():
+        process = subprocess.Popen(
+            left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
         remained = ""
         while True:
             buff = process.stdout.read(bufsize)
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
index 5a92951b10..06e14796da 100644
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -145,5 +145,35 @@ class TestXmap(unittest.TestCase):
                             self.assertEqual(e, mapper(idx))
 
 
+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def simple_parser(lines):
+            return lines
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            cmd = "cat %s" % temp.name
+            reader = paddle.v2.reader.pipe_reader(
+                cmd, simple_parser, bufsize=128)
+            for i in xrange(4):
+                result = []
+                for r in reader():
+                    result.append(r)
+
+                for idx, e in enumerate(records):
+                    print e, result[idx]
+                    self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index fe91df10da..8396fb44cf 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,8 +1,61 @@
 from setuptools import setup, Distribution, Extension
+import subprocess
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
 
+MAJOR   = 0
+MINOR   = 11
+PATCH   = 0
+RC      = 0
+ISTAGED = False
+
+
+
+def git_commit():
+    try:
+        cmd = ['git', 'rev-parse', 'HEAD']
+        git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
+    except:
+        git_commit = 'Unknown'
+    return git_commit
+
+def write_version_py(filename='paddle/version.py'):
+    cnt = '''
+# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+full_version    = '%(major)d.%(minor)d.%(patch)d'
+major           = '%(major)d'
+minor           = '%(minor)d'
+patch           = '%(patch)d'
+rc              = '%(rc)d'
+istaged         = %(istaged)s
+commit          = '%(commit)s'
+
+def show():
+    if istaged:
+        print 'full_version:', full_version
+        print 'major:', major
+        print 'minor:', minor
+        print 'patch:', patch
+        print 'rc:', rc
+    else:
+        print 'commit:', commit
+'''
+    commit = git_commit()
+    with open(filename, 'w') as f:
+        f.write(cnt % {
+            'major': MAJOR,
+            'minor': MINOR,
+            'patch': PATCH,
+            'rc': RC,
+            'version': '${PADDLE_VERSION}',
+            'commit': commit,
+            'istaged': ISTAGED})
+
+write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
+
+
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
@@ -15,13 +68,14 @@ packages=['paddle',
           'paddle.v2.plot',
           'paddle.v2.fluid',
           'paddle.v2.fluid.proto',
+          'paddle.v2.fluid.layers',
           'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
 
 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires+=["opencv-python"]
+    setup_requires+=['opencv-python']
 
 # the prefix is sys.prefix which should always be usr
 paddle_bin_dir = 'opt/paddle/bin'
@@ -36,7 +90,7 @@ paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
 if '${MKL_SHARED_LIBS}'!= '':
   paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';')
 
-setup(name='paddlepaddle',
+setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,