diff --git a/.gitignore b/.gitignore index 020d3f0c30..ac56a3320e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ cmake_install.cmake paddle/.timestamp python/paddlepaddle.egg-info/ paddle/pybind/pybind.h +python/paddle/version.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 6aeef23330..b309ff37e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,8 @@ SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") include(system) project(paddle CXX C Go) +message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION}) +message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION}) find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) @@ -58,6 +60,7 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) diff --git a/RELEASE.cn.md b/RELEASE.cn.md index 5deaf230a8..494c59730d 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,3 +1,62 @@ +# v0.11.0版本 + +## PaddlePaddle Fluid + +- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 +源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 + + +## 新特点 + +* 发布 `PaddlePaddle Fluid`。 +* 增加了用于模型预测的C-API。 +* 用Fluid API实现了一个简单的GAN的例子。 +* 增加了关于性能调优的文档。 +* 为`paddle.v2.dataset`下载数据集提供了重试机制. +* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 +* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment). +* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。 +* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库. 
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 + - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet + - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 +* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) +* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) +* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) +* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) +* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) +* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) +* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* 增加移动端友好的网页 + +## 改进 + +* 使用一个Python`whl`包即可安装. +* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 +* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 +* 删除了有一些bug的BarrierStat。 +* 清理和删除了paddle::Parameter中未使用的函数。 +* 删除了ProtoDataProvider。 +* Huber loss同时支持回归和分类。 +* 为sequence pooling 层增加`stride`参数。 +* v2 API自动使用cudnn batch normalization。 +* 可以使用一个固定的参数名共享BN层的参数。 +* 2D convolution operation支持variable-dimension input特性。 +* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 +* 优化网页导航。 + +## 错误修复 + +* 修复ROI pooling的Bug. cc9a761 +* 修复当label是dense vector是AUC变成0的问题. #5274 +* 修复WarpCTC 层的Bug. + + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 diff --git a/RELEASE.md b/RELEASE.md index 146f7afa7d..5a62c95513 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,75 @@ +# Release v0.11.0 + +## PaddlePaddle Fluid + +- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is + designed to allow users to program like PyTorch and TensorFlow Eager Execution. + In these systems, there is no longer the concept *model* and applications + do not include a symbolic description of a graph of operators nor a sequence + of layers. Instead, applications look exactly like a usual program that + describes a process of training or inference. The difference between + Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's + control-flow, `if-then-else` nor `for`. Instead, Fluid provides its + C++ implementations and their Python binding using the `with` statement. For an example + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program. +Executor works like an interpreter. 
In future version, we will improve +`Executor` into a debugger like GDB, and we might provide some compilers, +which, for example, takes an application like the above one, and outputs +an equivalent C++ source program, which can be compiled using +[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) +to generate binaries that use CUDA, or using +[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries +that make full use of Intel CPUs. + +## New Features + +* Release `PaddlePaddle Fluid`. +* Add C-API for model inference +* Use fluid API to create a simple GAN demo. +* Add develop guide about performance tunning. +* Add retry when download `paddle.v2.dataset`. +* Linking protobuf-lite not protobuf in C++. Reduce the binary size. +* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. +* A new style cmake functions for Paddle. It is based on Bazel API. +* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. + - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet + - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. +* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). +* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). +* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). +* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). +* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). +* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). +* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* Add mobile friendly webpages. + +## Improvements + +* Build and install using a single `whl` package. +* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). +* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. +* Remove buggy BarrierStat. +* Clean and remove unused functions in paddle::Parameter. +* Remove ProtoDataProvider. +* Huber loss supports both regression and classification. +* Add the `stride` parameter for sequence pooling layers. +* Enable v2 API use cudnn batch normalization automatically. +* The BN layer's parameter can be shared by a fixed the parameter name. +* Support variable-dimension input feature for 2D convolution operation. +* Refine cmake about CUDA to automatically detect GPU architecture. +* Improved website navigation. + +## Bug Fixes + +* Fix bug in ROI pooling. cc9a761 +* Fix AUC is zero when label is dense vector. #5274 +* Fix bug in WarpCTC layer. 
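
To make the Fluid control-flow description above concrete, the following sketch shows a `with`-based while loop of the kind linked in the Fluid section. It is illustrative only and not part of this commit; it assumes the `paddle.v2.fluid` layer API of this release (`While`, `less_than`, `increment`, `fill_constant`), whose exact names and signatures may differ slightly from what is shown.

```python
# Minimal sketch of Fluid's C++-backed while loop, driven from Python via `with`.
# Assumes the paddle.v2.fluid API of the v0.11.0 era; parameter names may differ.
import paddle.v2.fluid.layers as layers

i = layers.zeros(shape=[1], dtype='int64')                 # loop counter
limit = layers.fill_constant(shape=[1], dtype='int64', value=10)
cond = layers.less_than(x=i, y=limit)                      # condition lives in the program, not in Python

while_op = layers.While(cond=cond)                         # control flow implemented in C++
with while_op.block():                                     # Python binding via the `with` statement
    i = layers.increment(x=i, in_place=True)
    layers.less_than(x=i, y=limit, cond=cond)              # refresh the condition each iteration
```

The loop body is recorded into the program description and later run by the `Executor`, rather than being executed eagerly by the Python interpreter, which is the distinction from PyTorch and Eager Execution drawn above.
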
+ # Release v0.10.0 We are glad to release version 0.10.0. In this version, we are happy to release the new diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 16c2390fd3..8ee7fd28c5 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -2,27 +2,25 @@ Machine: -- Server - - Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket -- Laptop - - DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD - - i5 MacBook Pro (Retina, 13-inch, Early 2015) -- Desktop - - i7-6700k +- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket +- Laptop: TBD System: CentOS release 6.3 (Final), Docker 1.12.1. -PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS) -- MKL-DNN tag v0.11 -- MKLML 2018.0.1.20171007 -- OpenBLAS v0.2.20 -(TODO: will rerun after 0.11.0) +PaddlePaddle: (TODO: will rerun after 0.11.0) +- paddlepaddle/paddle:latest (for MKLML and MKL-DNN) + - MKL-DNN tag v0.11 + - MKLML 2018.0.1.20171007 +- paddlepaddle/paddle:latest-openblas (for OpenBLAS) + - OpenBLAS v0.2.20 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. ## Benchmark Model ### Server + +#### Training Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz Input image size - 3 * 224 * 224, Time: images/second @@ -35,9 +33,7 @@ Input image size - 3 * 224 * 224, Time: images/second | MKLML | 12.12 | 13.70 | 16.18 | | MKL-DNN | 28.46 | 29.83 | 30.44 | - -chart on batch size 128 -TBD + - ResNet-50 @@ -47,9 +43,7 @@ TBD | MKLML | 32.52 | 31.89 | 33.12 | | MKL-DNN | 81.69 | 82.35 | 84.08 | - -chart on batch size 128 -TBD + - GoogLeNet @@ -59,10 +53,35 @@ TBD | MKLML | 128.46| 137.89| 158.63 | | MKL-DNN     | 250.46| 264.83| 269.50 | -chart on batch size 128 -TBD + + +#### Inference +Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +- VGG-19 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|-------|-------|-------|-------| +| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 | +| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 | +| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 | + +- ResNet-50 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|--------|--------|--------|--------| +| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 | +| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 | +| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 | + + +- GoogLeNet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 | +| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 | +| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 | + ### Laptop TBD -### Desktop -TBD diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png new file mode 100644 index 0000000000..c3f67faf09 Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png new file mode 100644 index 0000000000..b96ecd5ff9 Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png new file mode 100644 index 0000000000..f830ca6a87 Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index b21fc43904..13294c0548 
100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) set(CBLAS_INC_DIR ${MKLML_INC_DIR}) set(CBLAS_LIBRARIES ${MKLML_LIB}) - add_definitions(-DPADDLE_USE_MKLML) + add_definitions(-DPADDLE_WITH_MKLML) add_definitions(-DLAPACK_FOUND) message(STATUS "Found cblas and lapack in MKLML " diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e550ec2856..5c6bcfde76 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -24,6 +24,11 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) endif(WITH_TESTING) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake index ac456933bd..aec51410b3 100644 --- a/cmake/external/cares.cmake +++ b/cmake/external/cares.cmake @@ -33,7 +33,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND make + BUILD_COMMAND make -j8 INSTALL_COMMAND make install ) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fc52d339d7..5d24caebdc 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") -add_definitions(-DPADDLE_USE_MKLDNN) +add_definitions(-DPADDLE_WITH_MKLDNN) LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4c4f59656d..97857a686b 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -114,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") -IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") - ADD_LIBRARY(cblas SHARED ${dummyfile}) -ELSE() - ADD_LIBRARY(cblas STATIC ${dummyfile}) -ENDIF() +ADD_LIBRARY(cblas STATIC ${dummyfile}) TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) IF(NOT ${CBLAS_FOUND}) diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst index eca3ce03bc..5317e66b64 100644 --- a/doc/api/v2/config/activation.rst +++ b/doc/api/v2/config/activation.rst @@ -99,3 +99,10 @@ STanh .. automodule:: paddle.v2.activation :members: STanh :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md index 2b4f921ae9..d9fe7d6bbb 100644 --- a/doc/design/refactor/distributed_architecture.md +++ b/doc/design/refactor/distributed_architecture.md @@ -53,7 +53,7 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. 
This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document - -[Design Doc: Operation Graph Based Parameter Server](./dist_train.md) +[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md) ## Distributed Training Architecture diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 62ff8f3229..14c081ea84 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Vers PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -3. 对这个版本的提交,做如下几个操作: +1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 +1. 对这个版本的提交,做如下几个操作: + * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 @@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本,遵循以下流程: pip install twine twine upload dist/[package to upload] ``` -4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 -5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 -6. 协同完成Release Note的书写 +1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 +1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 +1. 协同完成Release Note的书写 需要注意的是: @@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本,遵循以下流程: * `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 * 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 -# PaddlePaddle 分支规范 +## PaddlePaddle 分支规范 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 @@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 -# PaddlePaddle回归测试列表 +## PaddlePaddle回归测试列表 本列表说明PaddlePaddle发版之前需要测试的功能点。 -## PaddlePaddle Book中所有章节 +### PaddlePaddle Book中所有章节 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst index 3c525bdad6..c875c807b8 100644 --- a/doc/getstarted/build_and_install/build_from_source_cn.rst +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -19,7 +19,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 git clone https://github.com/PaddlePaddle/Paddle.git cd Paddle # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制 - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh # 如果不使用Docker编译环境,执行下面的命令 mkdir build cd build @@ -30,7 +30,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 .. 
code-block:: bash - pip install python/dist/*.whl + pip install build/python/dist/*.whl .. _run_test: @@ -45,7 +45,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 .. code-block:: bash - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh 如果不使用Docker,可以执行ctest命令即可: diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst index 76fbc43de2..f194f84ce7 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.rst +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -21,7 +21,7 @@ Then run: git clone https://github.com/PaddlePaddle/Paddle.git cd Paddle # run the following command to build a CPU-Only binaries if you are using docker - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh # else run these commands mkdir build cd build @@ -34,7 +34,7 @@ machine or copy it to the target machine. .. code-block:: bash - pip install python/dist/*.whl + pip install build/python/dist/*.whl .. _run_test: @@ -49,7 +49,7 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU. .. code-block:: bash - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh If you don't use Docker, just run ctest will start the tests: @@ -117,7 +117,7 @@ You can add :code:`-D` argument to pass such options, like: "WITH_PYTHON", "Build with integrated Python interpreter", "ON" "WITH_STYLE_CHECK", "Check code style when building", "ON" "WITH_TESTING", "Build unit tests", "ON" - "WITH_DOC", "Build documentaions", "OFF" + "WITH_DOC", "Build documentations", "OFF" "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index 88c5142dde..c9ba84c842 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式: pip_install_cn.rst docker_install_cn.rst - + ../../howto/dev/build_cn.md 编译流程 ++++++++ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index c8b60d0357..32d66d63dd 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -13,6 +13,7 @@ You can choose either pip or Docker to complete your install: pip_install_en.rst docker_install_en.rst + ../../howto/dev/build_en.md Build from Source diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md index 0b911f7b75..4a80a52451 100644 --- 
a/doc/howto/dev/build_cn.md +++ b/doc/howto/dev/build_cn.md @@ -1,4 +1,4 @@ -# 编译PaddlePaddle和运行单元测试 +# 用Docker编译和测试PaddlePaddle ## 需要的软硬件 diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md index d0048e3714..91c41ef8ce 100644 --- a/doc/howto/dev/build_en.md +++ b/doc/howto/dev/build_en.md @@ -1,4 +1,4 @@ -# Build PaddlePaddle from Source Code and Run Unit Test +# Build using Docker ## What Developers Need diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 6cfc9536f2..44dbeecbbd 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -30,8 +30,8 @@ -------------- | :---------------------- OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake Op定义 | `.cc`文件 -Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。 -注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中 +Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 +注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** @@ -153,7 +153,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: -- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - `typename T` : 表示数据类型,如`float`, `double`等。 @@ -165,7 +165,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 下面是 `MulKernel` `Compute`的实现: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -173,18 +173,16 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; - ``` -需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** +需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** -`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 -为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 +为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 @@ -197,9 +195,9 @@ MulOp(const std::string &type, const 
framework::VariableNameMap &inputs, ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` 在上面的代码中: @@ -209,17 +207,17 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 -- 在 `.cu`文件中注册GPU Kernel。 - - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: +- 在 `.cu`文件中注册CUDA Kernel。 + - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: ```cpp // if use Eigen unsupported module before include head files - // #define EIGEN_USE_GPU + #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` ### 5. 编译 @@ -236,71 +234,55 @@ make mul_op ## 实现单元测试 -单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 -### 前向Operator单元测试 -前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要: +Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: 1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 2. 生成随机的输入数据。 3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 +4. 
反向计算已经自动集成进测试框架,直接调用相应接口即可。 ```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - ``` -上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: - -- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 -- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 -- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 - - -### 反向Operator单元测试 + def test_check_output(self): + self.check_output() -反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。 + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + ``` - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` +上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: -下面解释代码中一些关键的地方: +- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 +- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 +- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 -- 调用`create_op("mul")`创建反向Op对应的前向Op。 +而反向测试中: - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 @@ -328,5 +310,5 @@ ctest -R test_mul_op - 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。 - 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。 -- 如果Op没有实现GPU Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 +- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 - 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md index 1e88e1f5b4..510233306c 100644 --- a/doc/howto/dev/new_op_en.md +++ b/doc/howto/dev/new_op_en.md @@ -28,8 +28,8 @@ An operator can be differentiated by whether in has kernel methods. An operator -------------- | :---------------------- OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface. Op definition | `.cc` files -Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files. -Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation. 
+Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files. +Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. ** @@ -151,7 +151,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel` inherits `framework::OpKernel`, which includes the following templates: -- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). - `typename T` denotes data type, such as `float` or `double`. @@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel`'s implementation of `Compute` is as follows: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; ``` -Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** +Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** -`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). 
To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). @@ -196,9 +195,9 @@ The definition of its corresponding backward operator, if applicable, is similar ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` In that code block, @@ -208,17 +207,17 @@ The definition of its corresponding backward operator, if applicable, is similar - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. -- Registering GPU Kernel in `.cu` files - - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as +- Registering CUDA Kernel in `.cu` files + - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as ```cpp // if use Eigen unsupported module before include head files #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` ### 5. Compilation @@ -253,62 +252,50 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass 2. Generating random input data. -3. Implementing the same computation logic in a Python script: +3. Implementing the same computation logic in a Python script. + +4. Call check gradient function to check the backward operator. ```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - ``` -Get its output, and compare it with the forward operator's own output. - -The code above first loads required packages. In addition, we have - -- `self.type = "mul" ` defines the type that is identical to what the operator's registered type. -- `self.inputs` defines input, with type `numpy.array` and initializes it. -- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. -### Testing Backward Operators + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) -A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. 
As a result, **a backward operator unit test needs to be have the prefix `test_`**. + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + ``` +Get its output, and compare it with the forward operator's own output. - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) +The code above first loads required packages. In addition, we have - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` +- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type. +- `self.inputs` defines input, with type `numpy.array` and initializes it. +- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. -Some key points in the code above include: +Some key points in checking gradient above include: -- `create_op("mul")` creates the backward operator's corresponding forward operator. - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. - The second variable `"Out"` points to the network's final output target `Out`. @@ -338,5 +325,5 @@ ctest -R test_mul_op - Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. - The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. -- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. +- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. 
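
The guide above notes that `Compute` is usually written with the Eigen unsupported Tensor module so that the same code serves both the CPU and CUDA kernels, but the `MulKernel` example delegates to `math::matmul` instead. Below is a minimal, illustrative sketch of an Eigen-based element-wise kernel using the `DeviceContext` template described above; it is not part of this commit, and the names used for illustration (`MyReluKernel`, the include paths) follow the framework conventions of this era but may differ in detail.

```cpp
// Illustrative sketch only: an element-wise kernel whose Compute runs on both
// CPU and CUDA because it is expressed with Eigen on the device's eigen_device().
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class MyReluKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto out_e = framework::EigenVector<T>::Flatten(*out);
    // The only device-specific piece is the Eigen device obtained from the
    // templated DeviceContext; the expression itself is shared by CPU and CUDA.
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    out_e.device(place) = x_e.cwiseMax(static_cast<T>(0));  // ReLU
  }
};

}  // namespace operators
}  // namespace paddle
```

Registration then uses the same pair of macros shown above: `REGISTER_OP_CPU_KERNEL` in the `.cc` file and the renamed `REGISTER_OP_CUDA_KERNEL` in the `.cu` file.
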
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index d6b8464100..cf84568ecd 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -25,8 +25,18 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) +SET(SWIG_NEED_FLAGS + -ftls-model=global-dynamic + -Wno-parentheses-equality + -Wno-self-assign + -Wno-maybe-uninitialized + -Wno-missing-field-initializers) + FOREACH(flag ${SWIG_NEED_FLAGS}) + safe_set_cxxflag(SWIG_CXX_FLAGS ${flag}) +ENDFOREACH() + SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}") SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_parameter diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp index bb8249a551..c038789340 100644 --- a/paddle/capi/Main.cpp +++ b/paddle/capi/Main.cpp @@ -43,4 +43,11 @@ paddle_error paddle_init(int argc, char** argv) { isInit = true; return kPD_NO_ERROR; } + +paddle_error paddle_init_thread() { + if (FLAGS_use_gpu) { + hl_init(FLAGS_gpu_id); + } + return kPD_NO_ERROR; +} } diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index 30f3a766f0..cbacd1fb71 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) { paddle_error paddle_matrix_set_row(paddle_matrix mat, uint64_t rowID, paddle_real* rowArray) { - if (mat == nullptr) return kPD_NULLPTR; + if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE; diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp new file mode 100644 index 0000000000..169b65f921 --- /dev/null +++ b/paddle/capi/error.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "error.h" + +const char* paddle_error_string(paddle_error err) { + switch (err) { + case kPD_NULLPTR: + return "nullptr error"; + case kPD_OUT_OF_RANGE: + return "out of range error"; + case kPD_PROTOBUF_ERROR: + return "protobuf error"; + case kPD_NOT_SUPPORTED: + return "not supported error"; + case kPD_UNDEFINED_ERROR: + return "undefined error"; + default: + return ""; + } +} diff --git a/paddle/capi/error.h b/paddle/capi/error.h index 44d8c2040d..9d9d0ed63a 100644 --- a/paddle/capi/error.h +++ b/paddle/capi/error.h @@ -15,6 +15,8 @@ limitations under the License. */ #ifndef __PADDLE_CAPI_ERROR_H__ #define __PADDLE_CAPI_ERROR_H__ +#include "config.h" + /** * Error Type for Paddle API. */ @@ -27,4 +29,9 @@ typedef enum { kPD_UNDEFINED_ERROR = -1, } paddle_error; +/** + * Error string for Paddle API. 
+ */ +PD_API const char* paddle_error_string(paddle_error err); + #endif diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt index 98e411ddc0..2fc8debdde 100644 --- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt +++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt @@ -1,8 +1,29 @@ project(multi_thread) cmake_minimum_required(VERSION 2.8) -aux_source_directory(. SRC_LIST) -add_executable(${PROJECT_NAME} ${SRC_LIST}) + find_package (Threads) + +if(NOT PADDLE_ROOT) + set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path") +endif() +if(PADDLE_ROOT) + include_directories(${PADDLE_ROOT}/include) + link_directories(${PADDLE_ROOT}/lib) +endif() + +set(CPU_SRCS main.c) +add_executable(${PROJECT_NAME} ${CPU_SRCS}) set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99) -target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared - ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${PROJECT_NAME} + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) + +find_package(CUDA QUIET) +if(CUDA_FOUND) + set(GPU_SRCS main_gpu.c) + cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS}) + set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99) + target_link_libraries(${PROJECT_NAME}_gpu + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) +endif(CUDA_FOUND) diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c new file mode 100644 index 0000000000..6fd376e0d1 --- /dev/null +++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c @@ -0,0 +1,113 @@ +#include +#include +#include +#include "../common/common.h" + +#define CONFIG_BIN "./trainer_config.bin" +#define NUM_THREAD 4 +#define NUM_ITER 1000 + +pthread_mutex_t mutex; + +/* + * @brief It is an simple inference example that runs multi-threads on a GPU. + * Each thread holds it own local gradient_machine but shares the same + * parameters. + * If you want to run on different GPUs, you need to launch + * multi-processes or set trainer_count > 1. + */ +void* thread_main(void* gm_ptr) { + // Initialize the thread environment of Paddle. + CHECK(paddle_init_thread()); + + paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr); + // Create input arguments. + paddle_arguments in_args = paddle_arguments_create_none(); + // Create input matrix. + paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1, + /* size */ 784, + /* useGPU */ true); + // Create output arguments. + paddle_arguments out_args = paddle_arguments_create_none(); + // Create output matrix. + paddle_matrix prob = paddle_matrix_create_none(); + + // CPU buffer to cache the input and output. + paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real)); + paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real)); + for (int iter = 0; iter < NUM_ITER; ++iter) { + // There is only one input layer of this network. 
+ CHECK(paddle_arguments_resize(in_args, 1)); + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + + for (int i = 0; i < 784; ++i) { + cpu_input[i] = rand() / ((float)RAND_MAX); + } + CHECK(paddle_matrix_set_value(mat, cpu_input)); + + CHECK(paddle_gradient_machine_forward(machine, + in_args, + out_args, + /* isTrain */ false)); + + CHECK(paddle_arguments_get_value(out_args, 0, prob)); + CHECK(paddle_matrix_get_value(prob, cpu_output)); + + pthread_mutex_lock(&mutex); + printf("Prob: "); + for (int i = 0; i < 10; ++i) { + printf("%.2f ", cpu_output[i]); + } + printf("\n"); + pthread_mutex_unlock(&mutex); + } + + CHECK(paddle_matrix_destroy(prob)); + CHECK(paddle_arguments_destroy(out_args)); + CHECK(paddle_matrix_destroy(mat)); + CHECK(paddle_arguments_destroy(in_args)); + CHECK(paddle_gradient_machine_destroy(machine)); + + free(cpu_input); + free(cpu_output); + + return NULL; +} + +int main() { + // Initalize Paddle + char* argv[] = {"--use_gpu=True"}; + CHECK(paddle_init(1, (char**)argv)); + + // Reading config binary file. It is generated by `convert_protobin.sh` + long size; + void* buf = read_config(CONFIG_BIN, &size); + + // Create a gradient machine for inference. + paddle_gradient_machine machine; + CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); + CHECK(paddle_gradient_machine_randomize_param(machine)); + + // Loading parameter. Uncomment the following line and change the directory. + // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, + // "./some_where_to_params")); + srand(time(0)); + pthread_mutex_init(&mutex, NULL); + + pthread_t threads[NUM_THREAD]; + + for (int i = 0; i < NUM_THREAD; ++i) { + paddle_gradient_machine thread_local_machine; + CHECK(paddle_gradient_machine_create_shared_param( + machine, buf, size, &thread_local_machine)); + pthread_create(&threads[i], NULL, thread_main, thread_local_machine); + } + + for (int i = 0; i < NUM_THREAD; ++i) { + pthread_join(threads[i], NULL); + } + + pthread_mutex_destroy(&mutex); + + return 0; +} diff --git a/paddle/capi/main.h b/paddle/capi/main.h index 893ebcbd58..99c4e8428d 100644 --- a/paddle/capi/main.h +++ b/paddle/capi/main.h @@ -26,6 +26,13 @@ extern "C" { */ PD_API paddle_error paddle_init(int argc, char** argv); +/** + * Initialize the thread environment of Paddle. + * @note it is requisite for GPU runs but optional for CPU runs. + * For GPU runs, all threads will run on the same GPU devices. + */ +PD_API paddle_error paddle_init_thread(); + #ifdef __cplusplus } #endif diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 89c1f48eda..8841806292 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -116,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt, * @param[in] paddingW padding width. * @param[out] tgtData output data. * @param[in] tgtStride stride between output data samples. + * @param[in] excludeMode whether to consider paddings for size. * */ extern void hl_avgpool_forward(const int frameCnt, @@ -132,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride); + const int tgtStride, + bool excludeMode); /** * @brief Maximum pool backward. @@ -154,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt, * @param[in] scaleB scale. * @param[out] backGrad output grad. * @param[in] outStride stride between output data samples. + * @param[in] excludeMode whether to consider paddings for size. 
* */ extern void hl_avgpool_backward(const int frameCnt, @@ -172,7 +175,8 @@ extern void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride); + const int outStride, + bool excludeMode); extern void hl_maxpool3D_forward(const int frameCnt, const real* inputData, diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 968ed4840f..706cc59a8e 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -68,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride) {} + const int tgtStride, + const bool excludeMode) {} inline void hl_avgpool_backward(const int frameCnt, const real* outGrad, @@ -86,7 +87,8 @@ inline void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride) {} + const int outStride, + const bool excludeMode) {} inline void hl_maxpool3D_forward(const int frameCnt, const real* inputData, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 3699b1e8ae..2d1bc4f6d5 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -210,7 +210,8 @@ __global__ void KeAvgPoolForward(const int nthreads, const int padH, const int padW, real* tgtData, - const int tgtStride) { + const int tgtStride, + const bool excludeMode) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -224,7 +225,8 @@ __global__ void KeAvgPoolForward(const int nthreads, int wend = min(wstart + sizeX, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - int pool_size = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; real aveval = 0; inputData += (frameNum * channels + c) * height * width; @@ -235,7 +237,7 @@ __global__ void KeAvgPoolForward(const int nthreads, } int tgtIndex = index % (pooledW * pooledH * channels) + frameNum * tgtStride; - tgtData[tgtIndex] = aveval / pool_size; + tgtData[tgtIndex] = aveval / poolSize; } } @@ -253,7 +255,8 @@ void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride) { + const int tgtStride, + const bool excludeMode) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; KeAvgPoolForward<<>>(num_kernels, @@ -270,7 +273,8 @@ void hl_avgpool_forward(const int frameCnt, paddingH, paddingW, tgtData, - tgtStride); + tgtStride, + excludeMode); CHECK_SYNC("hl_avgpool_forward failed"); } @@ -290,7 +294,8 @@ __global__ void KeAvgPoolBackward(const int nthreads, real scaleA, real scaleB, real* tgtGrad, - const int outStride) { + const int outStride, + const bool excludeMode) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -314,8 +319,9 @@ __global__ void KeAvgPoolBackward(const int nthreads, int wstart = pw * strideW - padW; int wend = min(wstart + sizeX, width); wstart = max(wstart, 0); - int poolsize = (hend - hstart) * (wend - wstart); - gradient += outGrad[ph * pooledW + pw] / poolsize; + int poolSize = + excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; + gradient += outGrad[ph * pooledW + pw] / poolSize; } } tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; @@ -338,7 +344,8 @@ void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride) { + const int outStride, + const bool excludeMode) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -358,7 +365,8 @@ void hl_avgpool_backward(const int frameCnt, scaleA, scaleB, backGrad, - outStride); + outStride, + excludeMode); CHECK_SYNC("hl_avgpool_backward failed"); } diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index c8b85caaca..a17036c652 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -33,8 +33,8 @@ static std::unordered_set* g_ctrl_flow_ops_ = nullptr; // We should design a better way to backward CtrlFlowOps. static std::unordered_set& CtrlFlowOps() { if (g_ctrl_flow_ops_ == nullptr) { - g_ctrl_flow_ops_ = - new std::unordered_set{"increment", "lod_rank_table"}; + g_ctrl_flow_ops_ = new std::unordered_set{ + "increment", "lod_rank_table", "less_than"}; } return *g_ctrl_flow_ops_; } @@ -190,8 +190,9 @@ static std::unique_ptr BackwardRecursive( // collect all the offset for each alias, // insert a sum operator to add all aliases to output insert_position.push_back( - {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, - {{"Out", {name}}}, {})}); + {dup_op.back(), + OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, + AttributeMap{})}); } // make sure the inserted `sum` ops follow the BFS order. @@ -216,7 +217,8 @@ static std::unique_ptr BackwardRecursive( // If part of input gradient of that operator is not calculated, fill // zero variables to that input gradient. 
net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {grad_input}}}, {})); + {{"Y", {grad_input}}}, + AttributeMap{})); } return false; }); @@ -392,8 +394,9 @@ std::vector> MakeOpGrad( 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); std::string new_name = prefix + kZeroVarSuffix; desc->Rename(in_name, new_name); - std::unique_ptr fill_zeros_op(new OpDescBind( - "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {})); + std::unique_ptr fill_zeros_op( + new OpDescBind("fill_zeros_like", {{"X", {prefix}}}, + {{"Y", {new_name}}}, AttributeMap{})); pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); } } @@ -483,8 +486,9 @@ std::vector> MakeBlockBackward( sum_op_inputs.emplace_back(new_name); next_g_name = sum_op_inputs.back(); } - std::unique_ptr sum_op(new OpDescBind( - "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); + std::unique_ptr sum_op( + new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, + AttributeMap{})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 2b858f5ea0..9fe49881d5 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -106,15 +106,15 @@ class FcOp : public operators::NetOp { FcOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs) : NetOp(type, inputs, outputs, attrs) { - AppendOp(OpRegistry::CreateOp("mul", - {{"X", {Input("X")}}, {"Y", {Input("W")}}}, - {{"Out", {Output("mul_result")}}}, {})); + AppendOp(OpRegistry::CreateOp( + "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, AttributeMap{})); auto input_b = Inputs("b"); std::string before_act = "mul_result"; if (input_b.size() != 0) { AppendOp(OpRegistry::CreateOp( "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, - {{"Out", {Output("add_result")}}}, {})); + {{"Out", {Output("add_result")}}}, AttributeMap{})); before_act = "add_result"; } else { auto out_varname = Output("add_result"); @@ -124,7 +124,7 @@ class FcOp : public operators::NetOp { } AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, - {{"Out", {Output("Out")}}}, {})); + {{"Out", {Output("Out")}}}, AttributeMap{})); CompleteAddOp(false); } }; @@ -278,8 +278,9 @@ REGISTER_OPERATOR(scale, f::NoneOp); REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); TEST(Backward, simple_op_not_need_grad) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); ASSERT_NE(fwd, nullptr); auto gop = f::Backward(*fwd, {"x"}); ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); @@ -296,9 +297,10 @@ TEST(Backward, net_fc_backward_normal) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_re"}}, {"Out", {"out"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -322,9 +324,10 @@ TEST(Backward, net_fc_backward_not_have_b) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_res"}}, {"Out", {"tmp"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, 
std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -346,13 +349,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { {{"mul_result", {"mul_tmp_0"}}, {"add_result", {"add_tmp_0"}}, {"Out", {"hidden0"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_tmp_1"}}, {"add_result", {"add_tmp_1"}}, {"Out", {"hidden1"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto bwd = Backward(net, {"x"}); // x@GRAD is not need. ASSERT_TRUE(bwd->IsNetOp()); @@ -381,12 +384,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { TEST(Backward, net_shared_weight) { ops::NetOp net; net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, - {{"Out", {"out"}}}, {})); + {{"Out", {"out"}}}, f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, - {{"Out", {"FinalOut"}}}, {})); + {{"Out", {"FinalOut"}}}, + f::AttributeMap{})); net.CompleteAddOp(); - auto bwd = f::Backward(net, {}); + auto bwd = f::Backward(net, std::unordered_set{}); ASSERT_TRUE(bwd->IsNetOp()); auto bwd_net = static_cast(bwd.get()); ASSERT_EQ(3UL, bwd_net->ops_.size()); @@ -394,8 +398,9 @@ TEST(Backward, net_shared_weight) { } TEST(Backward, op_all_input_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"x", "b"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -403,8 +408,9 @@ TEST(Backward, op_all_input_are_not_need) { } TEST(Backward, op_all_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"out"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -412,8 +418,9 @@ TEST(Backward, op_all_output_are_not_need) { } TEST(Backward, op_part_of_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, - {{"y", {"Y"}}, {"z", {"Z"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"Z"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -437,7 +444,7 @@ TEST(Backward, op_part_of_output_are_not_need) { TEST(Backward, op_part_of_input_are_not_need) { auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, - {{"Out", {"out"}}}, {}); + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"a"}); auto &grad_mul = *backward; ASSERT_EQ(grad_mul.Type(), "mul_grad"); @@ -458,19 +465,19 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { {{"mul_result", {"mul_out1"}}, {"add_result", {"add_out1"}}, {"Out", {"out1"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_out2"}}, {"add_result", {"tmp_out2"}}, {"Out", {"out2"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, {{"mul_result", {"mul_out3"}}, {"add_result", 
{"tmp_out3"}}, {"Out", {"out3"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); @@ -509,7 +516,8 @@ TEST(Backward, simple_single_op) { auto target = f::VarDescBind("out"); target.SetShape({1}); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); f::OpDescBind *fill_op = block->AllOps()[1]; @@ -546,7 +554,7 @@ TEST(Backward, default_attribute) { auto target = f::VarDescBind("out"); target.SetShape({1}); - AppendBackward(program, target, {}); + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); @@ -585,7 +593,8 @@ TEST(Backward, simple_mult_op) { auto target = f::VarDescBind("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 6UL + 1); f::OpDescBind *fill_op = block->AllOps()[forward_len]; @@ -817,7 +826,8 @@ TEST(Backward, shared_var) { auto target = f::VarDescBind("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 8UL); f::OpDescBind *fill_op = block->AllOps()[forward_len]; diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 2281d93df9..7ba1e3e4e3 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); if (in_var->GetType() != VarDesc::LOD_TENSOR) { - VLOG(3) << "input " << in << "is not LodTensor"; + VLOG(3) << "input " << in << " is not LodTensor"; return; } PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, @@ -316,8 +316,8 @@ static void InitInferShapeFuncs() { for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { auto op_type = kern_pair.first; auto &op_info = info_map.at(op_type); - auto op = - static_cast(op_info.Creator()("", {}, {}, {})); + auto op = static_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); if (op_info.infer_shape_) { // infer_shape has been registered. continue; } diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index daade439e5..b29238432b 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar { return 0; \ } -#define REGISTER_OP_GPU_KERNEL(op_type, ...) \ - REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__) +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) 
\ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) @@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar { #else #define USE_OP_KERNEL(op_type) \ USE_OP_DEVICE_KERNEL(op_type, CPU); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) + USE_OP_DEVICE_KERNEL(op_type, CUDA) #endif #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); @@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar { USE_OP_ITSELF(op_type); \ USE_OP_DEVICE_KERNEL(op_type, CPU); -#define USE_GPU_ONLY_OP(op_type) \ - USE_OP_ITSELF(op_type); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) #define USE_OP(op_type) \ USE_OP_ITSELF(op_type); \ diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 93467ab8ac..e83d754783 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,20 +22,6 @@ limitations under the License. */ namespace paddle { namespace framework { -template <> -Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_.GetEigenDevice(); -} - -#ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice& -ExecutionContext::GetEigenDevice() const { - return *device_context_.GetEigenDevice(); -} -#endif - std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -426,13 +412,10 @@ void OperatorWithKernel::Run(const Scope& scope, } kernel_iter->second->Compute(ctx); - - // throws errors if have. - dev_ctx.Finish(); } OpKernelType OperatorWithKernel::GetKernelType( const ExecutionContext& ctx) const { - return OpKernelType(IndicateDataType(ctx), ctx.device_context()); + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); } DataType OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 60861d9293..e60dbfc313 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -276,17 +276,25 @@ class ExecutionContext { out_tensor->set_lod(in_tensor.lod()); } - template ::EigenDeviceType> - DeviceType& GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_.GetPlace(); } + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + const platform::DeviceContext& device_context() const { return device_context_; } +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + //! Get actual name vector for this input. 
const std::vector& Inputs(const std::string& name) const { return op_.Inputs(name); @@ -297,14 +305,6 @@ class ExecutionContext { return op_.Outputs(name); } -#ifdef PADDLE_WITH_CUDA - const inline platform::CUDADeviceContext& cuda_device_context() const { - PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); - return *reinterpret_cast( - &device_context_); - } -#endif - private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 1e19f82b34..b678178454 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetKernelType(const ExecutionContext& ctx) const override { - return OpKernelType(DataType::FP32, ctx.device_context()); + return OpKernelType(DataType::FP32, ctx.GetPlace()); } }; @@ -261,7 +261,9 @@ class OperatorClone : public paddle::framework::OperatorBase { }; TEST(Operator, Clone) { - OperatorClone a("ABC", {}, {}, {}); + OperatorClone a("ABC", paddle::framework::VariableNameMap{}, + paddle::framework::VariableNameMap{}, + paddle::framework::AttributeMap{}); auto b = a.Clone(); ASSERT_EQ(a.Type(), b->Type()); } diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc index 5988874809..f21df37a29 100644 --- a/paddle/framework/prune_test.cc +++ b/paddle/framework/prune_test.cc @@ -54,7 +54,8 @@ TEST(Prune, one_operator) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); f::ProgramDesc pruned; @@ -71,10 +72,14 @@ TEST(Prune, forward) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, {}, block); - AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); @@ -90,11 +95,14 @@ TEST(Prune, multi_input_op) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, {}, block); - AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, {}, block); - AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, {}, + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + f::AttributeMap{}, block); f::ProgramDesc *pdesc = program.Proto(); 
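// Note: ops(3) below is the "three_one" op consuming b0, b1 and b2; marking it
// as the prune target should keep that op together with its three "one_one"
// producers in the pruned program, since pruning only retains ops needed to
// compute a target.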
pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); @@ -108,9 +116,12 @@ TEST(Prune, multi_output_op) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); @@ -124,9 +135,12 @@ TEST(Prune, multi_target) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index f5a41b66bf..57c890e488 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/utils/ClassRegistrar.h" #include "paddle/utils/Logging.h" -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN #include "MKLDNNActivation.h" #endif @@ -490,7 +490,7 @@ Error __must_check backward(Argument& act) { END_DEFINE_ACTIVATION(log) ActivationFunction* ActivationFunction::create(const std::string& type) { -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { return MKLDNNActivation::create(type); } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index be112b4123..68bf37d59d 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN #include "paddle/gserver/layers/MKLDNNLayer.h" #endif @@ -307,7 +307,7 @@ void NeuralNetwork::backward(const UpdateCallback& callback) { } void NeuralNetwork::finish() { -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN FOR_EACH_R(layer, layers_) { MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); if (dnnLayer) { diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp index 48132a3ce4..e7f081c023 100644 --- a/paddle/gserver/layers/ConvTransProjection.cpp +++ b/paddle/gserver/layers/ConvTransProjection.cpp @@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() { if (outputH_ == 0) outputH_ = configOutH_; if (outputW_ == 0) outputW_ = configOutW_; imageH_ = imageSize(outputH_, - filterH_, + (filterH_ - 1) * dilationH_ + 1, paddingH_, strideH_, /* caffeMode */ true); imageW_ = imageSize(outputW_, - filterW_, + (filterW_ - 1) * dilationW_ + 1, paddingW_, strideW_, /* caffeMode */ true); diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp new file mode 100644 index 0000000000..741984bb68 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp @@ -0,0 +1,163 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNLRNLayer.h" +#include "paddle/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer); + +bool MKLDNNLRNLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + /* the size of inputs for norm-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1UL); + const NormConfig& conf = config_.inputs(0).norm_conf(); + localSize_ = conf.size(); + alpha_ = conf.scale(); + beta_ = conf.pow(); + + ic_ = conf.channels(); + oc_ = ic_; + iw_ = conf.img_size(); + ow_ = conf.output_x(); + ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + oh_ = conf.has_output_y() ? 
conf.output_y() : conf.output_x(); + CHECK_EQ(iw_, ow_); + CHECK_EQ(ih_, oh_); + return true; +} + +void MKLDNNLRNLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + CHECK_EQ(inputLayers_.size(), 1UL); + reshapeInput(bs, ih, iw); + // ic_ and oc can not be changed + CHECK_EQ((size_t)ic, + inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) + << "Input channel can not be changed"; + oh = ih; + ow = iw; + reshapeOutput(oh, ow); + resizeOutput(bs, oc * oh * ow); +} + +void MKLDNNLRNLayer::resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inputs[0], out); + + resetFwdPD(fwdPD_, inputs[0], out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); +} + +void MKLDNNLRNLayer::resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr pd; + + resetBwdBuffers(inputs[0], out); + + resetBwdPD(pd, inputs[0], out); + + resetBwdPipeline(pipeline, pd, inputs[0], out); +} + +void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + resetInValue(in); + CHECK(in); + resetOutValue(out, in->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out) { + prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring + : prop_kind::forward_training; + auto fwdDesc = lrn_fwd::desc(pk, + algorithm::lrn_across_channels, + in->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_)); + // prepare workspace if necessary + workspace_ = + passType_ != PASS_TEST + ? std::make_shared(memory(pd->workspace_primitive_desc())) + : nullptr; +} + +void MKLDNNLRNLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + fwd_ = workspace_ + ? std::make_shared(lrn_fwd(*pd, *in, *workspace_, *out)) + : std::make_shared(lrn_fwd(*pd, *in, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0] && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVals_[0]->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } + CHECK(out); + auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels, + in->getMemoryDesc(), + out->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); +} + +void MKLDNNLRNLayer::resetBwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + if (pd == nullptr) { + return; + } + CHECK(inVals_[0]); + CHECK(workspace_); + bwdData_ = std::make_shared( + lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in)); + pipeline.push_back(*bwdData_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h new file mode 100644 index 0000000000..cfe5621252 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLRNLayer.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "MKLDNNLayer.h" +#include "mkldnn.hpp" + +namespace paddle { +typedef mkldnn::lrn_forward lrn_fwd; +typedef mkldnn::lrn_backward lrn_bwd; + +/** + * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer. + * + * The config file api is mkldnn_lrn + */ +class MKLDNNLRNLayer : public MKLDNNLayer { +protected: + // save forward primitive_desc, which can be used in backward + std::shared_ptr fwdPD_; + // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/ + // test_lrn_backward.cpp, lrn need workspace for backward + std::shared_ptr workspace_; + + int localSize_; + float alpha_, beta_; // scale and pow in paddle + +public: + explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {} + + ~MKLDNNLRNLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override; + + void resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) override; + + void resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) override; + +protected: + void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); + void resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out); + void resetFwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); + void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); + void resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); + void resetBwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp index 87613a96c5..fceb389d06 100644 --- a/paddle/gserver/layers/PoolLayer.cpp +++ b/paddle/gserver/layers/PoolLayer.cpp @@ -45,6 +45,8 @@ bool PoolLayer::init(const LayerMap& layerMap, strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + + excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true; return true; } diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h index d43292ad2d..9df672a935 100644 --- a/paddle/gserver/layers/PoolLayer.h +++ b/paddle/gserver/layers/PoolLayer.h @@ -38,6 +38,8 @@ protected: std::string poolType_; + bool excludeMode_; + public: explicit PoolLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp index d90b438448..6a9de394ce 100644 --- a/paddle/gserver/layers/PoolProjection.cpp +++ b/paddle/gserver/layers/PoolProjection.cpp @@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config, strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); outputY_ = conf.has_output_y() ? 
conf.output_y() : conf.output_x(); + + excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true; } size_t PoolProjection::getSize() { @@ -141,7 +143,8 @@ void AvgPoolProjection::forward() { outputY_, outputX_, confPaddingY_, - confPadding_); + confPadding_, + excludeMode_); } void AvgPoolProjection::backward(const UpdateCallback& callback) { @@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) { 1, 1, confPaddingY_, - confPadding_); + confPadding_, + excludeMode_); } } // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h index 9a75f465f6..a0412714bc 100644 --- a/paddle/gserver/layers/PoolProjection.h +++ b/paddle/gserver/layers/PoolProjection.h @@ -28,6 +28,7 @@ protected: int confPaddingY_, confPadding_; size_t channels_; std::string poolType_; + bool excludeMode_; public: PoolProjection(const ProjectionConfig& config, diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/gserver/tests/mkldnn_simple_net.conf index 8bbe91e56d..0e9d6b31fa 100644 --- a/paddle/gserver/tests/mkldnn_simple_net.conf +++ b/paddle/gserver/tests/mkldnn_simple_net.conf @@ -51,6 +51,8 @@ tmp = img_pool_layer(input=tmp, padding=1, pool_type=MaxPooling()) +tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75) + tmp = fc_layer(input=tmp, size=channels, bias_attr=False, diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index c5359f272b..a2f07937b8 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -238,9 +238,24 @@ void testProjectionConv(size_t groups, bool isDeconv) { /* caffeMode */ true); conv->set_output_x(output_x); conv->set_output_y(output_y); + LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x + << "; output_y: " << output_y; if (isDeconv) { + int deconv_image_x = imageSize(output_x, + (conv->filter_size() - 1) * DILATION + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int deconv_image_y = imageSize(output_y, + (conv->filter_size_y() - 1) * DILATION + 1, + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true); + + LOG(INFO) << " deconv_image_x: " << deconv_image_x + << "; deconv_image_y: " << deconv_image_y; conf.set_input_size(output_x * output_y * CHANNELS); - conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS); + conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS); } else { conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); conf.set_output_size(output_x * output_y * NUM_FILTERS); @@ -1211,7 +1226,10 @@ void setPoolConfig(TestConfig* config, pool->set_output_y(oh); } -void testPoolLayer(const string& poolType, bool trans, bool useGpu) { +void testPoolLayer(const string& poolType, + bool trans, + bool useGpu, + bool excludeMode = true) { TestConfig config; config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); LayerInputConfig* input = config.layerConfig.add_inputs(); @@ -1219,6 +1237,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) { pool->set_img_size(14); pool->set_img_size_y(14); + pool->set_exclude_mode(excludeMode); setPoolConfig(&config, pool, poolType); config.layerConfig.set_size(pool->output_x() * pool->output_y() * pool->channels()); @@ -1250,16 +1269,26 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { TEST(Layer, PoolLayer) { testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); + testPoolLayer("avg-projection", + /* trans= */ 
false, + /* useGpu= */ false, + /* excludeMode= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false); #ifdef PADDLE_WITH_CUDA testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); + testPoolLayer("avg-projection", + /* trans= */ false, + /* useGpu= */ true, + /* excludeMode= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer2( + "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); #endif } diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index 56b523f220..ad1dbc3ee2 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -272,6 +272,51 @@ TEST(MKLDNNLayer, BatchNormLayer) { testBatchNormLayer({4, 16, 8, 10}); } +struct testLRNDesc { + int bs, ic, ih, iw; + float scale, pow; + int localSize; +}; + +void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) { + cfg.layerConfig.set_type("mkldnn_lrn"); + cfg.layerConfig.set_active_type("relu"); + size_t layerSize = pm.ic * pm.ih * pm.iw; + cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_channels(pm.ic); + norm->set_size(pm.localSize); + norm->set_scale(pm.scale); + norm->set_pow(pm.pow); + norm->set_blocked(0); + norm->set_img_size(pm.iw); + norm->set_img_size_y(pm.ih); + norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); + cfg.layerConfig.set_size(layerSize); + cfg.biasSize = 0; +} + +void testLRNLayer(const testLRNDesc& pm) { + TestConfig dnnConfig; + getMKLDNNLRNConfig(dnnConfig, pm); + // mkldnn_lrn <==> norm with cmrnorm-projection type + TestConfig refConfig = dnnConfig; + refConfig.layerConfig.set_type("norm"); + LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cmrnorm-projection"); + norm->set_scale(norm->scale() / norm->size()); + RUN_MKLDNN_TEST(dnnConfig, refConfig, pm) +} + +TEST(MKLDNNLayer, LRNLayer) { + testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5}); + testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5}); + testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5}); +} + struct testImageDesc { int bs, ic, ih, iw; }; diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 94ef561f06..17563bf5e1 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -48,7 +48,7 @@ public: */ virtual void* alloc(size_t size) { void* ptr; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index ba86eacbb5..28ab54b450 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -206,7 +206,7 @@ double dotProduct(const int n, const double* x, const double* y) { } #endif -#if defined(PADDLE_USE_MKLML) +#if 
defined(PADDLE_WITH_MKLML) template <> void vExp(const int n, const float* a, float* r) { diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index f6e77029bd..29fe36e3a4 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -15,7 +15,7 @@ limitations under the License. */ #ifndef MATHFUNCTIONS_H_ #define MATHFUNCTIONS_H_ -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #include #include #include diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 88e9180690..1ec4336cab 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -28,6 +28,7 @@ limitations under the License. */ #include "hl_top_k.h" #include "paddle/utils/Logging.h" +#include "NEONFunctions.h" #include "paddle/function/GemmFunctor.h" #include "paddle/utils/ThreadLocal.h" @@ -1130,7 +1131,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; real* inputData = inputMat.getData(); @@ -1153,7 +1155,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, paddingH, paddingW, data_, - getStride()); + getStride(), + excludeMode); } void GpuMatrix::avgPoolBackward(Matrix& outGrad, @@ -1168,7 +1171,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; real* outDiff = outGrad.getData(); @@ -1194,7 +1198,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, scaleTargets, scaleOutput, data_, - outGrad.getStride()); + outGrad.getStride(), + excludeMode); } void GpuMatrix::maxPool3DForward(Matrix& inputMat, @@ -2136,7 +2141,8 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { // The main loop size_t num = input.getHeight(); size_t inLength = imgSizeH * imgSizeW; @@ -2165,7 +2171,8 @@ void CpuMatrix::avgPoolForward(Matrix& input, tgtData[ph * outputW + pw] += inData[h * imgSizeW + w]; } } - int poolSize = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; CHECK(poolSize); tgtData[ph * outputW + pw] /= poolSize; } @@ -2189,7 +2196,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input, real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { size_t num = input.getHeight(); size_t channels = input.getWidth() / outputH / outputW; size_t inLength = imgSizeH * imgSizeW; @@ -2211,7 +2219,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input, int wstart = pw * strideW - paddingW; int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); - int poolSize = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? 
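// As in avgPoolForward above, excludeMode selects the averaging denominator:
// true divides by the number of window positions that actually fall inside the
// image (padding excluded), false divides by the full window size
// sizeY * sizeX (padding included), the convention exercised by the
// cudnn-avg-incl-pad-pool case added in test_LayerGrad.cpp.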
(hend - hstart) * (wend - wstart) : sizeY * sizeX; CHECK(poolSize); for (int h = hstart; h < hend; ++h) { @@ -4157,16 +4166,36 @@ void CpuMatrix::print(std::ostream& os) const { void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { real* input = data.getData(); real* w = W.getData(); + real* output = data_; size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t paraSize = W.getHeight() * W.getWidth(); CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + if (paraSize == numElements) { + for (size_t n = 0; n < numSamples * numElements; ++n) { + output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; + } + return; + } + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + for (size_t n = 0; n < numSamples; ++n) { + for (size_t i = 0; i < paraSize; i++) { + neon::prelu( + input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); + } + input = input + numElements; + output = output + numElements; + } +#else for (size_t n = 0, k = 0; n < numSamples; ++n) { for (size_t i = 0; i < numElements; ++i, ++k) { - data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum]; + output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum]; } } +#endif } void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index e273f11236..c8e690e642 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -911,7 +911,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode = true) { LOG(FATAL) << "Not implemeted"; } @@ -927,9 +928,11 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode = true) { LOG(FATAL) << "Not implemeted"; } + /** * Pooling 3D forward operation, pick out the largest element * in the sizeX of value @@ -1458,7 +1461,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void avgPoolBackward(Matrix& input, size_t imgSizeH, @@ -1472,7 +1476,8 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void maxPool3DForward(Matrix& inputMat, Matrix& maxPoolIdx, @@ -1730,7 +1735,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void avgPoolBackward(Matrix& input, size_t imgSizeH, @@ -1744,7 +1750,8 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void maxPool3DForward(Matrix& inputMat, Matrix& maxPoolIdx, diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp index 3bf47901f1..0f83149422 100644 --- a/paddle/math/NEONFunctions.cpp +++ b/paddle/math/NEONFunctions.cpp @@ -49,6 +49,46 @@ void relu(const float* a, float* b, int len) { } } +// b[i] = a[i] > 0.0f ? 
a[i] : a[i] * w +void prelu(const float* a, float w, float* b, int len) { + int offset = len % 16; + float32x4_t ma0, ma1, ma2, ma3; + + float32x4_t zero = vdupq_n_f32(0.f); + float32x4_t vw = vdupq_n_f32(w); + + for (int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = vld1q_f32(a); + ma1 = vld1q_f32(a + 4); + ma2 = vld1q_f32(a + 8); + ma3 = vld1q_f32(a + 12); + + uint32x4_t flag0 = vcgtq_f32(ma0, zero); + uint32x4_t flag1 = vcgtq_f32(ma1, zero); + uint32x4_t flag2 = vcgtq_f32(ma2, zero); + uint32x4_t flag3 = vcgtq_f32(ma3, zero); + + float32x4_t mul0 = vmulq_f32(ma0, vw); + float32x4_t mul1 = vmulq_f32(ma1, vw); + float32x4_t mul2 = vmulq_f32(ma2, vw); + float32x4_t mul3 = vmulq_f32(ma3, vw); + + ma0 = vbslq_f32(flag0, ma0, mul0); + ma1 = vbslq_f32(flag1, ma1, mul1); + ma2 = vbslq_f32(flag2, ma2, mul2); + ma3 = vbslq_f32(flag3, ma3, mul3); + + vst1q_f32(b, ma0); + vst1q_f32(b + 4, ma1); + vst1q_f32(b + 8, ma2); + vst1q_f32(b + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + b[i] = a[i] > 0.0f ? a[i] : a[i] * w; + } +} + } // namespace neon } // namespace paddle diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h index 69085e3335..d67b2f47a8 100644 --- a/paddle/math/NEONFunctions.h +++ b/paddle/math/NEONFunctions.h @@ -18,6 +18,7 @@ namespace paddle { namespace neon { void relu(const float* a, float* b, int len); +void prelu(const float* a, float w, float* b, int len); } // namespace neon } // namespace paddle diff --git a/paddle/math/float16.h b/paddle/math/float16.h new file mode 100644 index 0000000000..76ad3a0123 --- /dev/null +++ b/paddle/math/float16.h @@ -0,0 +1,739 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif // PADDLE_WITH_CUDA + +#include "unsupported/Eigen/CXX11/Tensor" + +#include "paddle/platform/hostdevice.h" + +#ifdef __GNUC__ +#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) +#else +#define PADDLE_GNUC_VER 0 +#endif // __GNUC__ + +#ifdef __clang__ +#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) +#else +#define PADDLE_CLANG_VER 0 +#endif // __clang__ + +#if defined(__CUDACC__) && CUDA_VERSION >= 7050 +#define PADDLE_CUDA_FP16 +#include +#endif + +#if defined(__arm__) || defined(__aarch64__) +#define PADDLE_ARM +#endif + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#define PADDLE_NEON +#include +#endif + +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37) +#define PADDLE_WITH_NATIVE_FP16 +#endif + +#ifndef PADDLE_ARM +#include +#endif // PADDLE_ARM + +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) + +namespace paddle { + +// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated +// and aligned at least on a 2-byte boundary, which leads to efficient +// memory access of float16 struct and also makes float16 compatible +// with CUDA half, ARM float16_t, and Eigen::half data types. 
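// The stored value follows the IEEE 754 binary16 layout: 1 sign bit, 5 exponent
// bits and 10 mantissa bits packed into the uint16_t member `x`. A few reference
// encodings (also checked in test_float16.cpp added below):
//   float16(1.0f).x     == 0x3c00
//   float16(0.5f).x     == 0x3800
//   float16(-0.0f).x    == 0x8000
//   float16(65504.0f).x == 0x7bff   // largest finite half value
//   float16(65536.0f).x == 0x7c00   // overflows to +infinity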
+struct PADDLE_ALIGN(2) float16 { +public: + uint16_t x; + + // Constructors + HOSTDEVICE inline float16() : x(0) {} + + HOSTDEVICE inline float16(const float16& h) : x(h.x) {} + +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline explicit float16(const half& h) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&h)->x; +#else + x = h.x; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 + + HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} + +#ifdef PADDLE_WITH_NATIVE_FP16 + // __fp16 is a native half precision data type for arm cpu, + // float16_t is an alias for __fp16 + HOSTDEVICE inline explicit float16(const float16_t& h) { + x = *reinterpret_cast(&h); + } +#endif + + HOSTDEVICE inline explicit float16(float val) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); + +#elif defined(PADDLE_WITH_NATIVE_FP16) + float32x4_t tmp = vld1q_dup_f32(&val); + float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); + x = *reinterpret_cast(&res); + +#elif defined(__F16C__) + x = _cvtss_sh(val, 0); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + x = v.ui | sign; + +#endif + } + + HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {} + + template + HOSTDEVICE inline explicit float16(const T& val) + : x(float16(static_cast(val)).x) {} + + HOSTDEVICE inline float16& operator=(const float16& rhs) { + x = rhs.x; + return *this; + } + +// Assignment operators +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline float16& operator=(const half& rhs) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&rhs)->x; +#else + x = rhs.x; +#endif + return *this; + } +#endif + + HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + x = rhs.x; + return *this; + } + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); + return *this; + } +#endif + + HOSTDEVICE inline float16& operator=(bool b) { + x = b ? 
0x3c00 : 0; + return *this; + } + + HOSTDEVICE inline float16& operator=(int8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint8_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint16_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint32_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(int64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(uint64_t val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(float val) { + x = float16(val).x; + return *this; + } + + HOSTDEVICE inline float16& operator=(double val) { + x = float16(val).x; + return *this; + } + +// Conversion opertors +#ifdef PADDLE_CUDA_FP16 + HOSTDEVICE inline explicit operator half() const { +#if CUDA_VERSION >= 9000 + __half_raw h; + h.x = x; + return half(h); +#else + half h; + h.x = x; + return h; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 + + HOSTDEVICE inline explicit operator Eigen::half() const { + Eigen::half h; + h.x = x; + return h; + } + +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline explicit operator float16_t() const { + return *reinterpret_cast(this); + } +#endif + + HOSTDEVICE inline explicit operator float() const { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = *reinterpret_cast(this); + return __half2float(tmp); + +#elif defined(PADDLE_WITH_NATIVE_FP16) + float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); + return vgetq_lane_f32(vcvt_f32_f16(res), 0); + +#elif defined(__F16C__) + return _cvtsh_ss(this->x); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = this->x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(float(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(float(*this)); + } + +private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static const int shift = 13; + static const int shiftSign = 16; + + 
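// Constants used by the bit-level float <-> float16 conversions above:
// shift (13) is the difference between the float32 and float16 mantissa widths
// (23 - 10), and shiftSign (16) the difference between their sign-bit positions
// (31 - 15). The int32_t constants below are float32 bit patterns: infN is
// +infinity, maxN/minN are the largest and smallest normal half values
// expressed in float32, and sigN is the float32 sign bit.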
static const int32_t infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; +}; + +// Arithmetic operators on GPU +// CUDA 9.0 provides built-in arithmetic operators for half while +// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are +// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in +// CUDA 9.0 regarding the half data type. +#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 + +DEVICE inline half operator+(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + float res = float(float16(a)) + float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator-(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hsub(a, b); +#else + float res = float(float16(a)) - float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator*(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hmul(a, b); +#else + float res = float(float16(a)) * float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator/(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#else + float res = float(float16(a)) / float(float16(b)); + return half(float16(res)); +#endif +} + +DEVICE inline half operator-(const half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hneg(a); +#else + float res = -float(float16(a)); + return half(float16(res)); +#endif +} + +DEVICE inline half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} + +DEVICE inline bool operator==(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __heq(a, b); +#else + return float(float16(a)) == float(float16(b)); +#endif +} + +DEVICE inline bool operator!=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hne(a, b); +#else + return float(float16(a)) != float(float16(b)); +#endif +} + +DEVICE inline bool operator<(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b); +#else + return float(float16(a)) < float(float16(b)); +#endif +} + +DEVICE inline bool operator<=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && 
__CUDA_ARCH__ >= 530 + return __hle(a, b); +#else + return float(float16(a)) <= float(float16(b)); +#endif +} + +DEVICE inline bool operator>(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(a, b); +#else + return float(float16(a)) > float(float16(b)); +#endif +} + +DEVICE inline bool operator>=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hge(a, b); +#else + return float(float16(a)) >= float(float16(b)); +#endif +} + +#endif // PADDLE_CUDA_FP16 + +// Arithmetic operators on ARMv8.2-A CPU +#if defined(PADDLE_WITH_NATIVE_FP16) +HOST inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +HOST inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +HOST inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +HOST inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +HOST inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +HOST inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator!=(const float16& a, const float16& b) { + return !(a == b); +} + +HOST inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + 
: // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +// Arithmetic operators, software emulated on other CPU +#else +HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + return float16(float(a) + float(b)); +} + +HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { + return float16(float(a) - float(b)); +} + +HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { + return float16(float(a) * float(b)); +} + +HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { + return float16(float(a) / float(b)); +} + +HOSTDEVICE inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { + a = float16(float(a) + float(b)); + return a; +} + +HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { + a = float16(float(a) - float(b)); + return a; +} + +HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { + a = float16(float(a) * float(b)); + return a; +} + +HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { + a = float16(float(a) / float(b)); + return a; +} + +HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { + return float(a) == float(b); +} + +HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { + return float(a) != float(b); +} + +HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { + return float(a) < float(b); +} + +HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { + return float(a) <= float(b); +} + +HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { + return float(a) > float(b); +} + +HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { + return float(a) >= float(b); +} +#endif +} // namespace paddle diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index d8b7f9e3fc..dcd2a34583 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -22,6 +22,7 @@ if(WITH_GPU) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) link_paddle_test(test_lazyAssign) + nv_test(test_float16_gpu SRCS 
test_float16.cu) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -33,3 +34,4 @@ add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) +add_simple_unittest(test_float16) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000..74cc55aa37 --- /dev/null +++ b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/math/float16.h" + +#include + +namespace paddle { + +TEST(float16, conversion_cpu) { + // Explicit conversion from Eigen::half + EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + EXPECT_EQ(float16(1.0).x, 0x3c00); + EXPECT_EQ(float16(0.5).x, 0x3800); + EXPECT_EQ(float16(0.33333).x, 0x3555); + EXPECT_EQ(float16(0.0).x, 0x0000); + EXPECT_EQ(float16(-0.0).x, 0x8000); + EXPECT_EQ(float16(65504.0).x, 0x7bff); + EXPECT_EQ(float16(65536.0).x, 0x7c00); + + // Conversion from int + EXPECT_EQ(float16(-1).x, 0xbc00); + EXPECT_EQ(float16(0).x, 0x0000); + EXPECT_EQ(float16(1).x, 0x3c00); + EXPECT_EQ(float16(2).x, 0x4000); + EXPECT_EQ(float16(3).x, 0x4200); + + // Conversion from bool + EXPECT_EQ(float16(true).x, 0x3c00); + EXPECT_EQ(float16(false).x, 0x0000); + + // Default constructor + float16 v_def; + EXPECT_EQ(v_def.x, 0x0000); + + // Assignment operator + float16 v_assign; + v_assign = v_def; + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = Eigen::half(1.0f); + EXPECT_EQ(v_assign.x, 0x3c00); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3800); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3555); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbc00); + v_assign = true; + EXPECT_EQ(v_assign.x, 0x3c00); + + // Conversion operator + EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); + EXPECT_EQ(float(float16(0.5f)), 0.5f); + EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); + EXPECT_EQ(int(float16(-1)), -1); + EXPECT_EQ(bool(float16(true)), true); +} + +TEST(float16, arithmetic_cpu) { + EXPECT_EQ(float(float16(1) + float16(1)), 2); + EXPECT_EQ(float(float16(5) + float16(-5)), 0); + EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); + EXPECT_EQ(float(float16(3) - float16(5)), -2); + EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 
0.33334f, 0.001); + EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); + EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); + EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); + EXPECT_EQ(float(-float16(512.0f)), -512.0f); + EXPECT_EQ(float(-float16(-512.0f)), 512.0f); +} + +TEST(float16, comparison_cpu) { + EXPECT_TRUE(float16(1.0f) == float16(1.0f)); + EXPECT_FALSE(float16(-1.0f) == float16(-0.5f)); + EXPECT_TRUE(float16(1.0f) != float16(0.5f)); + EXPECT_FALSE(float16(-1.0f) != float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) < float16(2.0f)); + EXPECT_FALSE(float16(-1.0f) < float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) <= float16(1.0f)); + EXPECT_TRUE(float16(2.0f) > float16(1.0f)); + EXPECT_FALSE(float16(-2.0f) > float16(-2.0f)); + EXPECT_TRUE(float16(2.0f) >= float16(2.0f)); + + EXPECT_TRUE(float16(0.0f) == float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) <= float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) >= float16(-0.0f)); + EXPECT_FALSE(float16(0.0f) < float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) < float16(0.0f)); + EXPECT_FALSE(float16(0.0f) > float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) > float16(0.0f)); +} + +} // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu new file mode 100644 index 0000000000..4b520feaaf --- /dev/null +++ b/paddle/math/tests/test_float16.cu @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
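The hexadecimal expectations in test_float16.cpp above are simply the raw IEEE 754 binary16 encodings (1 sign bit, 5 exponent bits with bias 15, 10 fraction bits). A small standalone decoder, using nothing beyond the C++ standard library, shows why 0x3c00 is 1.0, 0x3800 is 0.5, 0x3555 is the nearest half to 0.33333, 0x7bff is 65504 (the largest finite half), and 0x7c00 is +infinity, which is what 65536.0f overflows to:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode a binary16 bit pattern; 0x7c00/0xfc00 are +/-inf, other exp==31 values are NaN.
double decode_half(uint16_t h) {
  int sign = (h >> 15) & 0x1;
  int exp  = (h >> 10) & 0x1f;
  int frac = h & 0x3ff;
  double s = sign ? -1.0 : 1.0;
  if (exp == 0x1f) return frac == 0 ? s * INFINITY : NAN;
  if (exp == 0) return s * std::ldexp(double(frac), -24);   // subnormal
  return s * std::ldexp(1.0 + frac / 1024.0, exp - 15);     // normal
}

int main() {
  std::printf("%g %g %g %g %g\n", decode_half(0x3c00), decode_half(0x3800),
              decode_half(0x3555), decode_half(0x7bff), decode_half(0x7c00));
  // prints: 1 0.5 0.333252 65504 inf
  return 0;
}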
*/ + +#include "paddle/math/float16.h" + +#include + +#include "paddle/utils/Logging.h" + +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, half* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; } + +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define ARITHMETIC_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + half *in1, *in2, *out; \ + half *d_in1, *d_in2, *d_out; \ + int size = sizeof(half); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, size); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + out = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(float16(out[0])), v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } + +#define COMPOUND_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ + int size = sizeof(half); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2); \ + cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(float16(in1[0])), v_out); \ + free(in1); \ + free(in2); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + } + +#define COMPARISON_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, bool v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ + bool *out, *d_out; \ + int size = sizeof(half); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, 1); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + out = (bool*)malloc(1); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(out[0], v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } + +#ifdef PADDLE_CUDA_FP16 +namespace paddle { + +#if CUDA_VERSION < 9000 +ARITHMETIC_KERNEL(Add, +) +ARITHMETIC_KERNEL(Sub, -) +ARITHMETIC_KERNEL(Mul, *) +ARITHMETIC_KERNEL(Div, /) + +ARITHMETIC_KERNEL_LAUNCH(Add) +ARITHMETIC_KERNEL_LAUNCH(Sub) +ARITHMETIC_KERNEL_LAUNCH(Mul) +ARITHMETIC_KERNEL_LAUNCH(Div) + +// Negative sign kernel +__global__ void Neg(half* in) { in[0] = -in[0]; } + +void TestNeg(float v_in, float v_out) { + LOG(INFO) << "Test Neg on 
GPU!"; + half *in, *d_in; + int size = sizeof(half); + cudaMalloc((void**)&d_in, size); + in = (half*)malloc(size); + in[0] = half(float16(v_in)); + cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); + Neg<<<1, 1>>>(d_in); + cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); + EXPECT_EQ(float(float16(in[0])), v_out); + free(in); + cudaFree(d_in); +} + +COMPOUND_KERNEL(AddAssign, +=) +COMPOUND_KERNEL(SubAssign, -=) +COMPOUND_KERNEL(MulAssign, *=) +COMPOUND_KERNEL(DivAssign, /=) + +COMPOUND_KERNEL_LAUNCH(AddAssign) +COMPOUND_KERNEL_LAUNCH(SubAssign) +COMPOUND_KERNEL_LAUNCH(MulAssign) +COMPOUND_KERNEL_LAUNCH(DivAssign) + +COMPARISON_KERNEL(Equal, ==) +COMPARISON_KERNEL(NotEqual, !=) +COMPARISON_KERNEL(Less, <) +COMPARISON_KERNEL(LessEqual, <=) +COMPARISON_KERNEL(Greater, >) +COMPARISON_KERNEL(GreaterEqual, >=) + +COMPARISON_KERNEL_LAUNCH(Equal) +COMPARISON_KERNEL_LAUNCH(NotEqual) +COMPARISON_KERNEL_LAUNCH(Less) +COMPARISON_KERNEL_LAUNCH(LessEqual) +COMPARISON_KERNEL_LAUNCH(Greater) +COMPARISON_KERNEL_LAUNCH(GreaterEqual) + +TEST(float16, arithmetic_on_gpu) { + TestAdd(1, 2, 3); + TestSub(2, 1, 1); + TestMul(2, 3, 6); + TestDiv(6, 2, 3); + TestNeg(1, -1); +} + +TEST(float16, compound_on_gpu) { + TestAddAssign(1, 2, 3); + TestSubAssign(2, 1, 1); + TestMulAssign(2, 3, 6); + TestDivAssign(6, 2, 3); +} + +TEST(float16, comparision_on_gpu) { + TestEqual(1, 1, true); + TestEqual(1, 2, false); + TestNotEqual(2, 3, true); + TestNotEqual(2, 2, false); + TestLess(3, 4, true); + TestLess(3, 3, false); + TestLessEqual(3, 3, true); + TestLessEqual(3, 2, false); + TestGreater(4, 3, true); + TestGreater(4, 4, false); + TestGreaterEqual(4, 4, true); + TestGreaterEqual(4, 5, false); +} +#endif // CUDA_VERSION + +TEST(float16, conversion_on_gpu) { + // Explicit conversion to and from cuda half + EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); + + // Assignment operator + float16 v_assign; + v_assign = half(float16(1.0f)); + EXPECT_EQ(v_assign.x, 0x3c00); +} + +} // namespace paddle +#endif // PADDLE_CUDA_FP16 diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 6b4e46f56a..6a815a1b57 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -43,7 +43,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { void* p; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); @@ -83,7 +83,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { paddle::platform::GpuMemoryUsage(available, capacity); // Reserve memory for page tables, etc. - size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t reserving = 0.05 * capacity + paddle::platform::GpuMinChunkSize(); size_t usable = available > reserving ? 
available - reserving : 0; // If remaining size no less than expected size, using general diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 95cfe2525e..9cafdfda75 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -64,19 +64,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { int gpu_num = platform::GetCUDADeviceCount(); as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - platform::SetDeviceId(gpu); - as[gpu] = new BuddyAllocator(new detail::GPUAllocator, - platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); + as[gpu] = nullptr; } + } + platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE: each GPU device use " << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n" - << "You can set environment variable '" - << platform::kEnvFractionGpuMemoryToUse + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" << "' to change the fraction of GPU usage.\n\n"; } - platform::SetDeviceId(gpu_id); return as[gpu_id]; } diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 38b89b9eb1..5aaaf99332 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -138,7 +138,7 @@ function(op_library TARGET) if ("${TARGET}" STREQUAL "nccl_op") set(pybind_flag 1) # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") endif() # reduce_op contains several operators diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 2785a8c6fb..76da21c472 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index d2dcab4e54..539a935302 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. 
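The memory.cc hunk above makes GetGPUBuddyAllocator lazy: the per-GPU array is filled with nullptr up front, and a device's BuddyAllocator is constructed only after SetDeviceId(gpu_id) for the device actually requested, instead of eagerly building one allocator per visible GPU on the first call. Reduced to a sketch with a hypothetical Allocator stand-in (not Paddle's API):

#include <vector>

struct Allocator {                       // stand-in for BuddyAllocator
  explicit Allocator(int device) : device_(device) {}
  int device_;
};

Allocator* GetAllocator(int dev_id, int dev_count) {
  static std::vector<Allocator*> pool;   // one slot per visible device
  if (pool.empty()) pool.assign(dev_count, nullptr);
  if (pool[dev_id] == nullptr) {
    pool[dev_id] = new Allocator(dev_id);  // constructed only on first use
  }
  return pool[dev_id];
}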
// label data is always int64 -REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h index d060e6eddd..04104a695f 100644 --- a/paddle/operators/accuracy_op.h +++ b/paddle/operators/accuracy_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class AccuracyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 154c618e8e..63490f0ec9 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -44,9 +44,9 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Sigmoid operator"); AddOutput("Y", "Output of Sigmoid operator"); AddComment(R"DOC( -Sigmoid Activation Operator. +Sigmoid Activation Operator -$y = 1 / (1 + e^{-x})$ +$$y = \frac{1}{1 + e^{-x}}$$ )DOC"); } @@ -60,9 +60,9 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of LogSigmoid operator"); AddOutput("Y", "Output of LogSigmoid operator"); AddComment(R"DOC( -Logsigmoid Activation Operator. +Logsigmoid Activation Operator -$y = \log(1 / (1 + e^{-x}))$ +$$y = \log \frac{1}{1 + e^{-x}}$$ )DOC"); } @@ -506,6 +506,22 @@ It is recommended to use the defaults for this activation. } }; +class SwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Swish operator"); + AddOutput("Y", "Output of Swish operator"); + AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); + AddComment(R"DOC( +Swish Activation Operator. 
+ +$$y = \frac{x}{1 + e^{- \beta x}}$$ + +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -592,16 +608,20 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, hard_sigmoid_grad, ops::ActivationOpGrad); -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 97737857ab..856d3fc35d 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -17,16 +17,17 @@ namespace ops = paddle::operators; -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_GPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); -FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL); +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 8cd3bfbbd3..75eefca8b8 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ActivationKernel : public framework::OpKernel { public: @@ -32,18 +32,19 @@ class ActivationKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y); + functor(*place, x, y); } }; -template +template class ActivationGradKernel : public framework::OpKernel { public: @@ -59,13 +60,14 @@ class ActivationGradKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y, dy, dx); + functor(*place, x, y, dy, dx); } }; @@ -700,6 +702,35 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { } }; +template +struct SwishFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Y y) const { + y.device(d) = x / (static_cast(1) + (static_cast(-beta) * x).exp()); + } +}; + +template +struct SwishGradFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Y y, dY dy, dX dx) 
const { + auto temp1 = static_cast(1) / + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto temp2 = temp1 * (static_cast(1) - (beta * y)); + dx.device(d) = dy * ((beta * y) + temp2); + } +}; + } // namespace operators } // namespace paddle @@ -730,4 +761,5 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { __macro(elu, ELUFunctor, ELUGradFunctor); \ __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ + __macro(swish, SwishFunctor, SwishGradFunctor); \ __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index 16a7794d5b..507811e7b5 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -92,12 +92,12 @@ for gradient descent. Adadelta updates are as follows: -$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break -paramUpdate = - $\sqrt{((avgSquaredUpdate + \epsilon) / - (avgSquaredGrad_out + \epsilon))}$ * grad \break -avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * - {(paramUpdate)}^2 \break -paramOut = param + paramUpdate$$ +$$ +avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\ +param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\ +avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\ +param\_out = param + param\_update +$$ )DOC"); } @@ -109,5 +109,5 @@ paramOut = param + paramUpdate$$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu index 9fb6185207..eee2d0a2f5 100644 --- a/paddle/operators/adadelta_op.cu +++ b/paddle/operators/adadelta_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adadelta_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h index a8c5f0c8aa..819d0845db 100644 --- a/paddle/operators/adadelta_op.h +++ b/paddle/operators/adadelta_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel { framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); auto avg_squared_update_out = framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); avg_squared_grad_out.device(place) = rho * avg_squared_grad + (1 - rho) * grad.square(); diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index d6686e3ef3..5d00716316 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -80,8 +80,8 @@ Adaptive Gradient Algorithm (Adagrad). 
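The SwishGradFunctor above implements dx = dy * (beta*y + sigmoid(beta*x) * (1 - beta*y)) for y = x * sigmoid(beta*x), which is the analytic derivative of the swish formula in the op comment. A quick standalone check of that expression against a central finite difference (plain C++, illustrative only, not part of the patch):

#include <cmath>
#include <cstdio>

double sigmoid(double v) { return 1.0 / (1.0 + std::exp(-v)); }
double swish(double x, double beta) { return x * sigmoid(beta * x); }

int main() {
  const double beta = 1.5, x = 0.7, eps = 1e-6;
  double y = swish(x, beta);
  // the gradient expression used by SwishGradFunctor (with dy == 1)
  double analytic = beta * y + sigmoid(beta * x) * (1.0 - beta * y);
  double numeric = (swish(x + eps, beta) - swish(x - eps, beta)) / (2.0 * eps);
  std::printf("analytic=%.8f numeric=%.8f\n", analytic, numeric);  // agree to ~1e-9
  return 0;
}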
The update is done as follows: -$$momentOut = moment + grad * grad \break -paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break +$$moment\_out = moment + grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} $$ The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) @@ -100,8 +100,8 @@ size_t FindPos(const std::vector& rows, int64_t value) { } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ -120,7 +120,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -144,9 +144,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. update parameter @@ -164,13 +164,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); REGISTER_OP_CPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 1c870214b2..585b2d9289 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ -92,7 +92,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -119,9 +119,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. 
update parameter @@ -139,13 +139,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index 4d4a6434c7..0d77dbcbac 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -19,15 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param); }; -template +template class AdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = moment + grad * grad; + moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } else if (grad_var->IsType()) { auto* param_tensor = ctx.Input("Param"); @@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel { auto* moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); - SparseAdagradFunctor functor; - functor(ctx.device_context(), *ctx.Input("Grad"), + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index 03faa2a7c5..cf6ef6dd53 100644 --- a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -112,11 +112,13 @@ adaptive estimates of lower-order moments. 
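In scalar form, the Adagrad update documented a few hunks above accumulates the squared gradient and rescales the step by its square root; an illustrative helper (names are not Paddle's):

#include <cmath>

struct AdagradState { double param, moment; };

// moment_out = moment + grad * grad
// param_out  = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
void adagrad_step(AdagradState& s, double grad, double lr, double eps = 1.0e-6) {
  s.moment += grad * grad;
  s.param  -= lr * grad / (std::sqrt(s.moment) + eps);
}

The SparseAdagradFunctor specializations apply the same rule, but first merge duplicate rows of the SelectedRows gradient so each parameter row is updated once.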
Adam updates: -$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break -moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break -learningRate = learningRate * - $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break -paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$ +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ )DOC"); } @@ -126,6 +128,6 @@ paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); -REGISTER_OP_CPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu index 6e34f7818c..c135b37378 100644 --- a/paddle/operators/adam_op.cu +++ b/paddle/operators/adam_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adam_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 7f7fa1da1c..45157842a6 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad; - moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square(); + moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad; + moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square(); // All of these are tensors of 1 element auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow); // Eigen does not support automatic broadcast // Get dimensions of moment vector to broadcast lr_t Eigen::DSizes m_dsize(moment1_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment1_out / (moment2_out.sqrt() + epsilon)); diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index d5bbc672e1..49ce497bb7 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -107,10 +107,12 @@ Adam algorithm based on the infinity norm. 
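Per scalar parameter, the reformatted Adam formulas above amount to the following (illustrative helper, not Paddle's API; beta1_pow and beta2_pow in the kernel correspond to beta1^t and beta2^t here):

#include <cmath>

struct AdamState { double param, m1, m2; };

// m1 = beta1*m1 + (1-beta1)*g        m2 = beta2*m2 + (1-beta2)*g*g
// lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
// param -= lr_t * m1 / (sqrt(m2) + eps)
void adam_step(AdamState& s, double g, double lr, int t,
               double beta1 = 0.9, double beta2 = 0.999, double eps = 1.0e-8) {
  s.m1 = beta1 * s.m1 + (1.0 - beta1) * g;
  s.m2 = beta2 * s.m2 + (1.0 - beta2) * g * g;
  double lr_t = lr * std::sqrt(1.0 - std::pow(beta2, t)) / (1.0 - std::pow(beta1, t));
  s.param -= lr_t * s.m1 / (std::sqrt(s.m2) + eps);
}

Adamax, documented next, keeps the same first moment but replaces m2 with an infinity norm, inf_norm = max(beta2 * inf_norm + epsilon, |g|), and divides by it directly.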
Adamax updates: -$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break -infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break -learningRate = learningRate /(1 - \beta_1_{pow}) \break -paramOut = param - learningRate * momentPut / infNormOut$$ +$$ +moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\ +inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\ +learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} +$$ The original paper does not have an epsilon attribute. However, it is added here for numerical stability to prevent the @@ -125,6 +127,6 @@ division by 0 error. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu index 057ef39025..2d143905c4 100644 --- a/paddle/operators/adamax_op.cu +++ b/paddle/operators/adamax_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adamax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index bf36ed7860..172c179c5f 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel { auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); auto inf_norm_out = framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(place) = + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); auto lr_t = lr / (1 - beta1_pow); Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); } }; diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h index e5ac57b038..b80509e2a9 100644 --- a/paddle/operators/auc_op.h +++ b/paddle/operators/auc_op.h @@ -25,7 +25,7 @@ template using EigenVector = framework::EigenVector; -template +template class AucKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index ac97bd83ab..94a972b7ab 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -135,7 +135,8 @@ The required data format for this layer is one of the following: }; template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const float epsilon = ctx.Attr("epsilon"); @@ -318,12 +319,12 @@ class BatchNormGradOp : public 
framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.device_context()); + ctx.GetPlace()); } }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -436,8 +437,9 @@ class BatchNormGradKernel namespace ops = paddle::operators; REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, batch_norm_grad, ops::BatchNormGradOp); -REGISTER_OP_CPU_KERNEL(batch_norm, - ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); REGISTER_OP_CPU_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 7b2f318700..c7adc3d80e 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -47,7 +47,8 @@ void ExtractNCWHD(const framework::DDim &dims, } template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -121,11 +122,12 @@ class BatchNormKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - math::SetConstant functor; - functor(ctx.device_context(), saved_mean, 0); - functor(ctx.device_context(), saved_variance, 0); + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto handle = dev_ctx.cudnn_handle(); // Now, depending on whether we are running test or not, we have two paths. 
if (is_test) { @@ -171,7 +173,7 @@ class BatchNormKernel : public framework::OpKernel { }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -244,11 +246,12 @@ class BatchNormGradKernel const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); + auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - ctx.cuda_device_context().cudnn_handle(), mode_, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x->template data(), data_desc_, d_y->template data(), data_desc_, + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, scale->template data(), d_scale->template mutable_data(ctx.GetPlace()), @@ -266,8 +269,9 @@ class BatchNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(batch_norm, - ops::BatchNormKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h index 4e80134a1a..8d99b68647 100644 --- a/paddle/operators/batch_norm_op.h +++ b/paddle/operators/batch_norm_op.h @@ -34,13 +34,13 @@ inline TensorFormat StringToTensorFormat(const std::string& str) { } } -template +template class BatchNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; }; -template +template class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc index c88b2c9beb..217fd52366 100644 --- a/paddle/operators/bilinear_tensor_product_op.cc +++ b/paddle/operators/bilinear_tensor_product_op.cc @@ -159,9 +159,12 @@ REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu index 858d2668d0..0f48010716 100644 --- a/paddle/operators/bilinear_tensor_product_op.cu +++ b/paddle/operators/bilinear_tensor_product_op.cu @@ -16,11 +16,15 @@ limitations under the License. 
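A pattern that recurs in nearly every operator touched by this patch: kernels are now templated on a DeviceContext type rather than a Place tag, pull their Eigen device via ctx.template device_context<...>().eigen_device(), and GPU kernels are registered with REGISTER_OP_CUDA_KERNEL against platform::CUDADeviceContext. The shape of that change, using hypothetical stand-in types rather than Paddle's:

#include <iostream>

struct FakeCPUDeviceContext {                 // stand-in for a device context
  struct EigenDevice { const char* name = "cpu-eigen-device"; };
  const EigenDevice* eigen_device() const { return &dev_; }
  EigenDevice dev_;
};

// After the refactor the kernel asks the context for its device directly,
// instead of mapping a Place template argument onto one.
template <typename DeviceContext>
void run_kernel(const DeviceContext& ctx) {
  auto* place = ctx.eigen_device();
  std::cout << "running on " << place->name << "\n";
}

int main() {
  FakeCPUDeviceContext ctx;
  run_kernel(ctx);  // the same body would serve a CUDA device-context stand-in
  return 0;
}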
*/ #include "paddle/operators/bilinear_tensor_product_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_GPU_KERNEL( + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h index 1113a4c6f3..ba9a2c5ce3 100644 --- a/paddle/operators/bilinear_tensor_product_op.h +++ b/paddle/operators/bilinear_tensor_product_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class BilinearTensorProductKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class BilinearTensorProductKernel : public framework::OpKernel { int out_dim = weight_dims[0]; auto x_dim = weight_dims[1]; auto y_dim = weight_dims[2]; - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the result of // Input(X) multiplied by Input(Weight_i), the formula is: @@ -60,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto output_col_vec = output_mat.chip(i, 1); Tensor weight_mat = weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); + math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); output_col_vec.device(place) = (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); } @@ -74,7 +75,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { } }; -template +template class BilinearTensorProductGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -96,8 +97,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { auto x_mat = EigenMatrix::From(*x); auto y_mat = EigenMatrix::From(*y); auto d_out_mat = EigenMatrix::From(*d_out); - auto place = ctx.GetEigenDevice(); - + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the Output(Y@Grad). Tensor x_scale; x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), @@ -110,18 +111,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { ctx.GetPlace()); auto y_scale_mat = EigenMatrix::From(y_scale); - math::SetConstant set_zero; + math::SetConstant set_zero; // Set Output(X@Grad) be zero. if (d_x) { d_x->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_x, static_cast(0)); + set_zero(dev_ctx, d_x, static_cast(0)); } // Set Output(Y@Grad) be zero. if (d_y) { d_y->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_y, static_cast(0)); + set_zero(dev_ctx, d_y, static_cast(0)); } // Caculate the Output(X@Grad) and Output(Y@Grad). 
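The forward kernel above builds each output column by a gemm of X with the i-th slice of the weight tensor, followed by an elementwise product with Y and a row-wise sum. As a naive reference (plain C++ loops, hypothetical helper, the op's optional bias term omitted), the computation is out[b][k] = sum_i sum_j x[b][i] * W[k][i][j] * y[b][j]:

#include <vector>

std::vector<std::vector<double>> bilinear_tensor_product(
    const std::vector<std::vector<double>>& x,                 // [batch][x_dim]
    const std::vector<std::vector<double>>& y,                 // [batch][y_dim]
    const std::vector<std::vector<std::vector<double>>>& W) {  // [out_dim][x_dim][y_dim]
  size_t batch = x.size(), out_dim = W.size();
  std::vector<std::vector<double>> out(batch, std::vector<double>(out_dim, 0.0));
  for (size_t b = 0; b < batch; ++b)
    for (size_t k = 0; k < out_dim; ++k)
      for (size_t i = 0; i < x[b].size(); ++i)
        for (size_t j = 0; j < y[b].size(); ++j)
          out[b][k] += x[b][i] * W[k][i][j] * y[b][j];
  return out;
}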
@@ -137,18 +138,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_x) * y_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasTrans, - batch_size, x_dim, y_dim, 1, y_scale.data(), - weight_i.data(), 1, d_x->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); } if (d_y) { x_scale_mat.device(place) = output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_y) * x_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x_scale.data(), - weight_i.data(), 1, d_y->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); } } } @@ -165,9 +166,9 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_weight) * x_mat; - math::gemm(ctx.device_context(), CblasTrans, CblasNoTrans, - x_dim, y_dim, batch_size, 1, x_scale.data(), - y->data(), 0, d_weight_i.data()); + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); } } diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 3082a53ccf..42bff69a1e 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -68,7 +68,7 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUPlace; +using CPU = paddle::platform::CPUDeviceContext; REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, ops::CastOpProtoMaker); REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu index fb75ddbabf..4681deaa62 100644 --- a/paddle/operators/cast_op.cu +++ b/paddle/operators/cast_op.cu @@ -16,7 +16,7 @@ template using CastOpKernel = - paddle::operators::CastOpKernel; + paddle::operators::CastOpKernel; -REGISTER_OP_GPU_KERNEL(cast, CastOpKernel, CastOpKernel, - CastOpKernel, CastOpKernel); +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel); diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h index 850dc8e349..a6773f13a8 100644 --- a/paddle/operators/cast_op.h +++ b/paddle/operators/cast_op.h @@ -27,13 +27,13 @@ struct CastOpTransformFunctor { HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } }; -template +template struct CastOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::DeviceContext& ctx) + const DeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -42,13 +42,13 @@ struct CastOpFunctor { auto numel = in_->numel(); auto* in_end = in_begin + numel; auto* out_begin = out_->mutable_data(ctx_.GetPlace()); - platform::Transform trans; + platform::Transform trans; trans(ctx_, in_begin, in_end, out_begin, CastOpTransformFunctor()); } }; -template +template class CastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +56,8 @@ class CastOpKernel : public framework::OpKernel { auto* out = context.Output("Out"); framework::VisitDataType( 
static_cast(context.Attr("out_dtype")), - CastOpFunctor(in, out, context.device_context())); + CastOpFunctor( + in, out, context.template device_context())); } }; diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc index 309660b01f..94127ab33e 100644 --- a/paddle/operators/chunk_eval_op.cc +++ b/paddle/operators/chunk_eval_op.cc @@ -58,9 +58,10 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Inference", - "(Tensor, default: Tensor). Predictions from the network."); + "(Tensor, default: Tensor). " + "Predictions from the network."); AddInput("Label", - "(Tensor, default: Tensor). The true tag sequences."); + "(Tensor, default: Tensor). The true tag sequences."); AddOutput("Precision", "(float). The evaluated precision (called positive predictive " "value) of chunks on the given mini-batch."); @@ -84,7 +85,7 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(std::vector{}); AddComment(R"DOC( For some basics of chunking, please refer to -‘Chunking with Support Vector Mechines ’. +‘Chunking with Support Vector Machines ’. CheckEvalOp computes the precision, recall, and F1-score of chunk detection, @@ -97,7 +98,7 @@ Here is a NER example of labeling for these tagging schemes: IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC -There are three chunk types(named entity types) including PER(person), ORG(orgnazation) +There are three chunk types(named entity types) including PER(person), ORG(organization) and LOC(LOCATION), and we can see that the labels have the form -. Since the calculations actually use label ids rather than labels, extra attention diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index 81aa07817b..9cd758a825 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class ChunkEvalKernel : public framework::OpKernel { public: struct Segment { @@ -35,10 +35,10 @@ class ChunkEvalKernel : public framework::OpKernel { } }; - void GetSegments(const int* label, int length, std::vector& segments, - int num_chunk_types, int num_tag_types, int other_chunk_type, - int tag_begin, int tag_inside, int tag_end, - int tag_single) const { + void GetSegments(const int64_t* label, int length, + std::vector& segments, int num_chunk_types, + int num_tag_types, int other_chunk_type, int tag_begin, + int tag_inside, int tag_end, int tag_single) const { segments.clear(); segments.reserve(length); int chunk_start = 0; @@ -152,8 +152,8 @@ class ChunkEvalKernel : public framework::OpKernel { auto* recall = context.Output("Recall"); auto* f1 = context.Output("F1-Score"); - const int* inference_data = inference->data(); - const int* label_data = label->data(); + const int64_t* inference_data = inference->data(); + const int64_t* label_data = label->data(); T* precision_data = precision->mutable_data(context.GetPlace()); T* racall_data = recall->mutable_data(context.GetPlace()); T* f1_data = f1->mutable_data(context.GetPlace()); @@ -179,7 +179,7 @@ class ChunkEvalKernel : public framework::OpKernel { ((*precision_data) + (*racall_data)); } - void EvalOneSeq(const int* output, const int* label, int length, + void EvalOneSeq(const int64_t* output, const int64_t* label, int length, 
std::vector& output_segments, std::vector& label_segments, int64_t& num_output_segments, int64_t& num_label_segments, diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc index d9fc532e39..0b7975a63f 100644 --- a/paddle/operators/clip_by_norm_op.cc +++ b/paddle/operators/clip_by_norm_op.cc @@ -47,15 +47,19 @@ class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The output of clip_by_norm op with shape as input(X)"); AddAttr("max_norm", "(float) The maximum norm value."); AddComment(R"DOC( -ClipByNorm operator limits the L2 norm of the input 'X' within 'max_norm'. -If the L2 norm of 'X' is less than or equal to 'max_norm', 'Out' will be -the same as 'X'. If the L2 norm of 'X' is greater than 'max_norm', 'X' will -be linearly scaled to make the L2 norm of 'Out' equal to 'max_norm', as -shown in the following formula: +ClipByNorm Operator. -'Out' = 'max_norm' * 'X' / norm('X'), +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: -where norm('X') represents the L2 norm of 'X'. +$$ +Out = \frac{max\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. )DOC"); } }; @@ -67,4 +71,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); REGISTER_OP_CPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu index 2593a24ebb..acd7543823 100644 --- a/paddle/operators/clip_by_norm_op.cu +++ b/paddle/operators/clip_by_norm_op.cu @@ -15,5 +15,6 @@ #include "paddle/operators/clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h index b26476cae9..d8db1566b0 100644 --- a/paddle/operators/clip_by_norm_op.h +++ b/paddle/operators/clip_by_norm_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,7 +38,8 @@ class ClipByNormKernel : public framework::OpKernel { auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); auto x_norm = x.square().sum().sqrt(); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto temp = (x_norm <= max_norm).template cast().eval(); auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 3e9066ceb2..6092212de4 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -52,7 +52,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker { Clip Operator. The clip operator limits the value of given input within an interval. The interval is -specified with arguments 'min' and 'max'. 
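The rewritten ClipByNorm comment a little further up states that Out = max_norm * X / norm(X) only when the L2 norm of X exceeds max_norm, and that X passes through unchanged otherwise. Over a flat buffer the semantics are simply (illustrative sketch, not the op's kernel):

#include <cmath>
#include <vector>

void clip_by_norm(std::vector<float>& x, float max_norm) {
  double sum_sq = 0.0;
  for (float v : x) sum_sq += static_cast<double>(v) * v;
  double norm = std::sqrt(sum_sq);
  if (norm > max_norm) {
    float scale = static_cast<float>(max_norm / norm);
    for (float& v : x) v *= scale;   // rescale so the L2 norm becomes max_norm
  }
}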
+specified with arguments 'min' and 'max': + +$$ +Out = \min(\max(X, min), max) +$$ )DOC"); } @@ -79,7 +83,7 @@ class ClipOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, ops::ClipOpGrad); -REGISTER_OP_CPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_CPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu index ca9701298f..bb7dcc671a 100644 --- a/paddle/operators/clip_op.cu +++ b/paddle/operators/clip_op.cu @@ -15,7 +15,7 @@ #include "paddle/operators/clip_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_GPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h index ac702e9935..0c40797410 100644 --- a/paddle/operators/clip_op.h +++ b/paddle/operators/clip_op.h @@ -55,7 +55,7 @@ class ClipGradFunctor { T max_; }; -template +template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -66,13 +66,13 @@ class ClipKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); const T* x_data = x->data(); int64_t numel = x->numel(); - Transform trans; - trans(context.device_context(), x_data, x_data + numel, out_data, - ClipFunctor(min, max)); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); } }; -template +template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -86,9 +86,9 @@ class ClipGradKernel : public framework::OpKernel { auto* d_x_data = d_x->mutable_data(context.GetPlace()); const T* d_out_data = d_out->data(); const T* x_data = x->data(); - Transform trans; - trans(context.device_context(), d_out_data, d_out_data + numel, x_data, - d_x_data, ClipGradFunctor(min, max)); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); } } }; diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu index 6ac8c124b9..596a878bcf 100644 --- a/paddle/operators/compare_op.cu +++ b/paddle/operators/compare_op.cu @@ -14,10 +14,10 @@ #include "paddle/operators/compare_op.h" -REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor); -REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor); -REGISTER_LOGICAL_KERNEL(greater_than, GPU, +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(greater_than, CUDA, paddle::operators::GreaterThanFunctor); -REGISTER_LOGICAL_KERNEL(greater_equal, GPU, +REGISTER_LOGICAL_KERNEL(greater_equal, CUDA, paddle::operators::GreaterEqualFunctor); -REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index afdf3ab3e0..a56536e155 100644 --- a/paddle/operators/compare_op.h +++ 
b/paddle/operators/compare_op.h @@ -59,7 +59,7 @@ struct EqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -69,24 +69,23 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; } // namespace operators } // namespace paddle -#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>); +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 6134ac78b1..cf522d6921 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -41,14 +41,18 @@ class ConcatOp : public framework::OperatorWithKernel { for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { out_dims[axis] += ins[i][j]; - continue; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); } - PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], - "Input tensors should have the same " - "elements except the specify axis."); } } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/concat_op.cu.cc b/paddle/operators/concat_op.cu.cc index ede832ddcd..7b46452d3d 100644 --- a/paddle/operators/concat_op.cu.cc +++ b/paddle/operators/concat_op.cu.cc @@ -14,7 +14,8 @@ limitations under the License. */ #include "paddle/operators/concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(concat, - ops::ConcatKernel); -REGISTER_OP_GPU_KERNEL( - concat_grad, ops::ConcatGradKernel); +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h index c113f19fb5..de4011585a 100644 --- a/paddle/operators/concat_op.h +++ b/paddle/operators/concat_op.h @@ -21,7 +21,7 @@ limitations under the License. 
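The reworked ConcatOp::InferShape above sums the extents along the concatenation axis, requires every other dimension to match, and keeps an unknown extent as -1. A small standalone sketch of that shape rule, assuming plain dimension vectors rather than framework::DDim (names are illustrative):

    // Minimal sketch of the concat shape rule enforced above: the output grows
    // along the concatenation axis, every other dimension must match, and an
    // unknown (negative) extent along the axis stays unknown (-1).
    #include <cassert>
    #include <iostream>
    #include <vector>

    std::vector<long> InferConcatShape(const std::vector<std::vector<long>>& ins,
                                       size_t axis) {
      assert(!ins.empty());
      std::vector<long> out = ins[0];
      for (size_t i = 1; i < ins.size(); ++i) {
        assert(ins[i].size() == out.size());
        for (size_t j = 0; j < out.size(); ++j) {
          if (j == axis) {
            out[j] += ins[i][j];
          } else {
            assert(out[j] == ins[i][j] && "non-axis dims must match");
          }
        }
      }
      if (out[axis] < 0) out[axis] = -1;  // propagate an unknown extent
      return out;
    }

    int main() {
      auto out = InferConcatShape({{2, 3}, {2, 5}}, 1);
      std::cout << out[0] << " x " << out[1] << "\n";  // 2 x 8
      return 0;
    }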
*/ namespace paddle { namespace operators { -template +template class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class ConcatKernel : public framework::OpKernel { } }; -template +template class ConcatGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc index d5b124682d..03c58a7eab 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -142,9 +142,9 @@ class ConditionalBlockGradOp : public ConditionalOp { continue; } auto new_in_grad_name = cur_scope.Rename(in_grad_name); - auto assign = - framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}}, - {{"Out", {out_grad_name}}}, {}); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); assign->Run(cur_scope, dev_ctx); cur_scope.Rename(new_in_grad_name, in_grad_name); } diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc index 0dd8c13b2a..008bf01885 100644 --- a/paddle/operators/conv_cudnn_op.cc +++ b/paddle/operators/conv_cudnn_op.cc @@ -57,18 +57,20 @@ REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker, REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker, conv3d_cudnn_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv2d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv3d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 3f97dc7ee0..3da0a9001a 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -28,7 +28,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; -static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024; +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = + static_cast(1024) * 1024 * 1024; template class CudnnConvOpKernel : public framework::OpKernel { @@ -44,7 +45,8 @@ class CudnnConvOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -116,7 +118,8 @@ class CudnnConvOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = 
dev_ctx.cudnn_handle(); PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, @@ -163,7 +166,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor input_desc; @@ -235,7 +239,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( @@ -310,16 +315,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(conv2d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 462e6d9cbc..7ef805fd44 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -235,16 +235,18 @@ namespace ops = paddle::operators; REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 546451234a..38615a8bef 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -16,16 +16,18 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + 
ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_GPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 09bff0a68d..749258183b 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -72,7 +72,7 @@ class ConvOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -141,9 +141,10 @@ class GemmConvKernel : public framework::OpKernel { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + auto& dev_ctx = context.template device_context(); for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); @@ -157,27 +158,26 @@ class GemmConvKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { // im2col - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { // vol2col - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), filter_slice, false, - col_matrix, false, T(1.0), &out_slice, T(0.0)); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); } } } }; -template +template class GemmConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -256,14 +256,15 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = @@ -282,18 +283,17 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(in_grad_slice); col_matrix.Resize(col_matrix_shape); } - math::matmul(context.device_context(), filter_slice, true, - out_grad_slice, false, T(1.0), &col_matrix, - T(0.0)); + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); if (is_expand && data_dim == 2U) { - 
col2im(context.device_context(), col, dilations, strides, + col2im(dev_ctx, col, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &in_grad_slice); } else if (is_expand && data_dim == 3U) { - col2vol(context.device_context(), col, dilations, strides, paddings, - &in_grad_slice); + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); } } } @@ -303,9 +303,9 @@ class GemmConvGradKernel : public framework::OpKernel { filter_grad->mutable_data(context.GetPlace()); Tensor filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); - set_zero(context.device_context(), filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_matrix_shape); @@ -321,21 +321,20 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), out_grad_slice, - false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); } } } diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu index 95e13c38a8..f7ca82ce26 100644 --- a/paddle/operators/conv_shift_op.cu +++ b/paddle/operators/conv_shift_op.cu @@ -111,7 +111,8 @@ __global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, } // namespace template -class ConvShiftKernel : public framework::OpKernel { +class ConvShiftKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -132,7 +133,8 @@ class ConvShiftKernel : public framework::OpKernel { dim3 grid_dim(num_x_blocks, batch_size); - auto stream = context.cuda_device_context().stream(); + auto stream = + context.template device_context().stream(); ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -140,7 +142,7 @@ class ConvShiftKernel : public framework::OpKernel { }; template -class ConvShiftGradKernel +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -159,8 +161,9 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -186,8 +189,9 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + conv_shift, + 
ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( conv_shift_grad, - ops::ConvShiftGradKernel); + ops::ConvShiftGradKernel); diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h index 5a160b0f16..1a70b38a0d 100644 --- a/paddle/operators/conv_shift_op.h +++ b/paddle/operators/conv_shift_op.h @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; }; -template +template class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc index 8d5804fce5..02a749a341 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cc @@ -57,12 +57,13 @@ REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad, @@ -70,9 +71,10 @@ REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv3d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index 494904fe52..f0297f6c40 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -83,7 +83,8 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionBwdDataAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); // Get the algorithm PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, @@ -165,7 +166,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( @@ -234,16 +236,16 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn, 
- ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index c31a2e4a70..9e8971937e 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -215,21 +215,23 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, REGISTER_OP_CPU_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc index 4165eb0c7b..b91ebd7922 100644 --- a/paddle/operators/conv_transpose_op.cu.cc +++ b/paddle/operators/conv_transpose_op.cu.cc @@ -16,20 +16,24 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 65a0076d9c..42dfbb23bc 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -52,7 +52,7 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvTransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -110,11 
+110,12 @@ class GemmConvTransposeKernel : public framework::OpKernel { filter.Resize(filter_matrix_shape); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; - set_zero(context.device_context(), output, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; // convolution transpose: gemm + col2im or col2vol (similar to conv-backward // on input) @@ -127,28 +128,27 @@ class GemmConvTransposeKernel : public framework::OpKernel { // col_matrix = filter * input_batch // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - math::matmul(context.device_context(), filter, true, - input_batch, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); if (data_dim == 2U) { // col2im: col_matrix -> dy // from (c * k_h * k_w, h * w) to (c, o_h, o_w) - col2im(context.device_context(), col, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, + col2im(dev_ctx, col, std::vector{dilations[0], dilations[1]}, + strides, std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, &output_batch); } else if (data_dim == 3U) { // col2vol: col_matrix -> dy // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(context.device_context(), col, dilations, strides, paddings, - &output_batch); + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); } } } }; -template +template class GemmConvTransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -206,6 +206,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // convolution transpose grad on input: // im2col + gemm (similar to conv-forward) // input need to compute gradient + auto& dev_ctx = context.template device_context(); if (input_grad || filter_grad) { Tensor col; col.mutable_data(col_shape, context.GetPlace()); @@ -217,18 +218,18 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); Tensor filter_grad_; - math::SetConstant set_zero; + math::SetConstant set_zero; - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); } if (filter_grad) { // filter size (m, c, k_h, k_w) filter_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_grad, static_cast(0)); + set_zero(dev_ctx, filter_grad, static_cast(0)); filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); } @@ -241,15 +242,16 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if (data_dim == 2U) { // im2col: dy -> col matrix // from (c, o_h, o_w) to (c * k_h * k_w, h * w) - im2col(context.device_context(), output_grad_batch, dilations, - strides, std::vector{paddings[0], paddings[1], - paddings[0], paddings[1]}, + im2col(dev_ctx, output_grad_batch, + std::vector{dilations[0], dilations[1]}, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, &col); } else if (data_dim == 3U) { // vol2col: dy -> col_matrix 
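Both GemmConvKernel and GemmConvTransposeKernel above lower the (de)convolution to a GEMM by rearranging patches with im2col/vol2col, or scattering them back with col2im/col2vol. A conceptual, single-channel im2col sketch under simplifying assumptions (stride 1, no padding, no dilation; this is not the math::Im2ColFunctor implementation):

    // Conceptual sketch only: im2col copies each KxK window of the input into
    // one column, after which the convolution is a matrix multiply of the
    // filter row with that column matrix.
    #include <iostream>
    #include <vector>

    // Returns a (K*K) x (out_h*out_w) matrix stored row-major.
    std::vector<float> Im2Col(const std::vector<float>& im, int H, int W, int K) {
      const int out_h = H - K + 1, out_w = W - K + 1;
      std::vector<float> col(K * K * out_h * out_w);
      for (int kh = 0; kh < K; ++kh)
        for (int kw = 0; kw < K; ++kw)
          for (int oh = 0; oh < out_h; ++oh)
            for (int ow = 0; ow < out_w; ++ow)
              col[((kh * K + kw) * out_h + oh) * out_w + ow] =
                  im[(oh + kh) * W + (ow + kw)];
      return col;
    }

    int main() {
      // 3x3 input, 2x2 filter of ones -> each output is the sum of a 2x2 window.
      std::vector<float> im = {1, 2, 3, 4, 5, 6, 7, 8, 9};
      std::vector<float> filter = {1, 1, 1, 1};          // 1 x (K*K)
      auto col = Im2Col(im, 3, 3, 2);                    // (K*K) x 4
      for (int j = 0; j < 4; ++j) {                      // GEMM: filter * col
        float out = 0;
        for (int i = 0; i < 4; ++i) out += filter[i] * col[i * 4 + j];
        std::cout << out << " ";                         // 12 16 24 28
      }
      std::cout << "\n";
      return 0;
    }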
// from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) - vol2col(context.device_context(), output_grad_batch, dilations, - strides, paddings, &col); + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); } if (input_grad) { @@ -261,9 +263,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // d, h, w) - math::matmul(context.device_context(), filter, false, - col_matrix, false, static_cast(1.0), - &input_grad_batch, static_cast(0.0)); + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); } if (filter_grad) { // input batch @@ -273,9 +275,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // k_h * k_w) - math::matmul(context.device_context(), in_batch, false, - col_matrix, true, static_cast(1.0), - &filter_grad_, static_cast(1.0)); + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); } } } diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 312264ccd4..440c427cba 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -155,7 +155,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, ops::CosSimOpGrad); -REGISTER_OP_CPU_KERNEL(cos_sim, - ops::CosSimKernel); REGISTER_OP_CPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 0cb8fd26de..1cb01f5945 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cos_sim, - ops::CosSimKernel); -REGISTER_OP_GPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 62a4e484ec..fecb5a79b2 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class CosSimKernel : public framework::OpKernel { auto y_norm = EigenVector::Flatten(*out_y_norm); // compute - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto row_along = Eigen::array({{1}}); x_norm.device(place) = x.square().sum(row_along).sqrt(); y_norm.device(place) = y.square().sum(row_along).sqrt(); @@ -66,7 +67,7 @@ class CosSimKernel : public framework::OpKernel { } }; -template +template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -96,7 +97,8 @@ class CosSimGradKernel : public framework::OpKernel { auto z_bcast = z.broadcast(bcast_cols); auto dz_bcast = dz.broadcast(bcast_cols); auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols); - auto place = 
context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); if (rows_x == rows_y) { auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols); auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols); diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index f418f489c0..1ce189fa6e 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -36,17 +36,18 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { "w. See more details in comments of the linear_chain_crf operator."); AddInput( "Label", - "(LoDTensor, LoDTensor). The ground truth with shape " + "(LoDTensor, LoDTensor). The ground truth with shape " "[N x 1]. This input is optional. See more details in the operator's " "comments.") .AsDispensable(); - AddOutput("ViterbiPath", - "(LoDTensor, LoDTensor). The decoding results. What to " - "return changes depending on whether the Input(Label) (the groud " - "truth) is given. See more details in the operator's comment."); + AddOutput( + "ViterbiPath", + "(LoDTensor, LoDTensor). The decoding results. What to " + "return changes depending on whether the Input(Label) (the ground " + "truth) is given. See more details in the operator's comment."); AddComment(R"DOC( The crf_decoding operator reads the emission feature weights and the transition -freature weights learned by the linear_chain_crf operator. It implements the +feature weights learned by the linear_chain_crf operator. It implements the Viterbi algorithm which is a dynamic programming algorithm for finding the most likely sequence of hidden states, called the Viterbi path, that results in a sequence of observed tags. @@ -60,14 +61,14 @@ operator. When Input(Label) is given, the crf_decoding operator returns a row vector with shape [N x 1] whose values are fixed to be 0, indicating an incorrect -prediction, or 1 indicating a tag is correctly predicted. Such an ouput is the +prediction, or 1 indicating a tag is correctly predicted. Such an output is the input to chunk_eval operator. 2. Input(Label) is not given: This is the standard decoding process. -The crf_decoding operator returns a row vecotr with shape [N x 1] whose values +The crf_decoding operator returns a row vector with shape [N x 1] whose values range from 0 to maximum tag number - 1. Each element indicates an index of a predicted tag. 
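The crf_decoding comment above describes recovering the most likely tag path with the Viterbi algorithm. A generic single-sequence decoding sketch of that recurrence, keeping only emission and tag-to-tag transition scores and omitting the start/stop weights and LoD batching handled by the real CRFDecodingOpKernel:

    // Dynamic programming over alpha(k, v) = best score of any tag path that
    // ends in tag v at step k, with back-pointers to recover the path.
    #include <iostream>
    #include <vector>

    std::vector<int> Viterbi(const std::vector<std::vector<double>>& emission,
                             const std::vector<std::vector<double>>& trans) {
      const int len = emission.size();
      const int tags = emission[0].size();
      std::vector<std::vector<double>> alpha(len, std::vector<double>(tags));
      std::vector<std::vector<int>> backptr(len, std::vector<int>(tags, 0));

      alpha[0] = emission[0];
      for (int k = 1; k < len; ++k) {
        for (int v = 0; v < tags; ++v) {
          double best = alpha[k - 1][0] + trans[0][v];
          int best_u = 0;
          for (int u = 1; u < tags; ++u) {
            double score = alpha[k - 1][u] + trans[u][v];
            if (score > best) { best = score; best_u = u; }
          }
          alpha[k][v] = best + emission[k][v];
          backptr[k][v] = best_u;
        }
      }

      // Backtrack from the best final tag to recover the path.
      std::vector<int> path(len);
      int v = 0;
      for (int u = 1; u < tags; ++u)
        if (alpha[len - 1][u] > alpha[len - 1][v]) v = u;
      for (int k = len - 1; k >= 0; --k) {
        path[k] = v;
        v = backptr[k][v];
      }
      return path;
    }

    int main() {
      // Two tags, three steps; transitions discourage switching tags.
      std::vector<std::vector<double>> emission = {{2, 0}, {0, 1}, {3, 0}};
      std::vector<std::vector<double>> trans = {{1, -1}, {-1, 1}};
      for (int tag : Viterbi(emission, trans)) std::cout << tag << " ";  // 0 0 0
      std::cout << "\n";
      return 0;
    }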
)DOC"); @@ -134,5 +135,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, ops::CRFDecodingOpMaker); REGISTER_OP_CPU_KERNEL( - crf_decoding, ops::CRFDecodingOpKernel, - ops::CRFDecodingOpKernel); + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index 526e0c5dcb..f6827b7b11 100644 --- a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -24,7 +24,7 @@ using framework::LoDTensor; using framework::LoD; using framework::Tensor; -template +template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,9 +43,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { const size_t level = 0; const size_t seq_num = lod[level].size() - 1; - int* path = decoded_path->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - decoded_path, 0); + int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); for (size_t i = 0; i < seq_num; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); @@ -57,7 +57,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { if (label) { PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, "The Input(Label) should be a sequence."); - const int* label_value = label->data(); + const int64_t* label_value = label->data(); size_t batch_size = emission_weights->dims()[0]; for (size_t i = 0; i < batch_size; ++i) { path[i] = label_value[i] == path[i] ? 1 : 0; @@ -76,7 +76,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { const T* x = emission_weights.data(); const T* w = transition_weights.data(); - int* path = decoded_path->data(); + int64_t* path = decoded_path->data(); // alpha is a memo table. 
An element alpha(k, v) records the score of the // best sequence of tags from position 1 to position k with v being the end diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 6752eb8c1c..7c2a0ac7a7 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -133,5 +133,5 @@ class CropOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_CPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index f8ee18a1d6..90fd83ca10 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_GPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 2e72583d68..d531a19c78 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -49,7 +49,7 @@ class CropKernel : public framework::OpKernel { } }; -template +template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); if (d_x != nullptr) { @@ -63,12 +63,13 @@ void CropGradFunction(const framework::ExecutionContext& context) { } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device(context.GetEigenDevice()) = + d_x_tensor.device( + *context.template device_context().eigen_device()) = d_out_tensor.pad(paddings, 0); } } -template +template class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -76,22 +77,22 @@ class CropGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - CropGradFunction(context); + CropGradFunction(context); break; case 2: - CropGradFunction(context); + CropGradFunction(context); break; case 3: - CropGradFunction(context); + CropGradFunction(context); break; case 4: - CropGradFunction(context); + CropGradFunction(context); break; case 5: - CropGradFunction(context); + CropGradFunction(context); break; case 6: - CropGradFunction(context); + CropGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 1e82742eaf..2b06012b69 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -95,6 +95,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Label) should be 1."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); } protected: diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 6212e39dfd..0546964588 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -53,8 +53,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, label, ctx.Attr("soft_label")); + 
math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, label, + ctx.Attr("soft_label")); } }; @@ -80,15 +81,17 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (batch_size * class_num + block - 1) / block; - auto stream = ctx.cuda_device_context().stream(); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); if (ctx.Attr("soft_label")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, batch_size, class_num); } else { - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(dev_ctx, dx, 0); auto* label_data = label->data(); grid = (batch_size + block - 1) / block; CrossEntropyGradientKernel<<>>( @@ -101,8 +104,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, - ops::CrossEntropyOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpCUDAKernel, - ops::CrossEntropyGradientOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, + ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 37db0a930a..5623d2ded1 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -37,8 +37,9 @@ class CrossEntropyOpKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, labels, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, + ctx.Attr("soft_label")); } }; @@ -61,7 +62,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto lbl_mat = EigenMatrix::From(*label); auto dx_mat = EigenMatrix::From(*dx); - dx_mat.device(ctx.GetEigenDevice()) = + dx_mat.device(*ctx.template device_context() + .eigen_device()) = -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); } else { @@ -70,8 +72,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { const T* x_data = x->data(); const int64_t* label_data = label->data(); - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(ctx.template device_context(), dx, 0); for (int64_t i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc index 640b4e7744..fd29c7270b 100644 --- a/paddle/operators/decayed_adagrad_op.cc +++ b/paddle/operators/decayed_adagrad_op.cc @@ -99,4 +99,4 @@ REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, ops::DecayedAdagradOpMaker); REGISTER_OP_CPU_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu index 6fce77fe4e..282b90f275 100644 --- a/paddle/operators/decayed_adagrad_op.cu +++ b/paddle/operators/decayed_adagrad_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( 
decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h index 0fe0fc5acd..fec9705cfc 100644 --- a/paddle/operators/decayed_adagrad_op.h +++ b/paddle/operators/decayed_adagrad_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class DecayedAdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index 932c0bf8fb..acd526ae80 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -100,6 +100,8 @@ namespace ops = paddle::operators; REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel); + dropout, + ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index db3578b9bf..10c670751d 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -58,7 +58,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); @@ -80,7 +80,9 @@ class GPUDropoutKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - dropout, ops::GPUDropoutKernel); -REGISTER_OP_GPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index d9a130fdc0..84ad39f0bb 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -25,7 +25,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CPUDropoutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -55,13 +55,14 @@ class CPUDropoutKernel : public framework::OpKernel { } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Y.device(place) = X * dropout_prob; } } }; -template +template class DropoutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -77,7 +78,8 @@ class DropoutGradKernel : public framework::OpKernel { auto dX = EigenMatrix::Reshape(*grad_x, 1); auto dY = EigenMatrix::Reshape(*grad_y, 1); - auto place = 
context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); dX.device(place) = dY * M; } }; diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc index 432b9ba6f7..a62eeeeb95 100644 --- a/paddle/operators/elementwise_add_op.cc +++ b/paddle/operators/elementwise_add_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, elementwise_add_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu index 7591428ac7..78642bb424 100644 --- a/paddle/operators/elementwise_add_op.cu +++ b/paddle/operators/elementwise_add_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index f04fe3ec60..069bdaf0ab 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -19,11 +19,48 @@ namespace paddle { namespace operators { -template +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + TransformFunctor, T, DeviceContext> functor( + x, y, z, ctx.template device_context(), AddFunctor()); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor.Run(); + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } } }; @@ -100,11 +137,11 @@ struct ElementwiseAddBroadCast2GradFunctor { } }; -template +template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseAddOneGradFunctor, ElementwiseAddBroadCastGradFunctor, ElementwiseAddBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc index 7a325199bd..1c3e9e70ee 100644 --- a/paddle/operators/elementwise_div_op.cc +++ b/paddle/operators/elementwise_div_op.cc @@ -35,13 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu index de4d0c3344..502c528936 100644 --- a/paddle/operators/elementwise_div_op.cu +++ b/paddle/operators/elementwise_div_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 8946ff3d25..d91313db42 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -19,11 +19,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -102,11 +102,11 @@ struct ElementwiseDivBroadCast2GradFunctor { } }; -template +template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseDivGradFunctor, ElementwiseDivBroadCastGradFunctor, ElementwiseDivBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_mul_op.cc 
b/paddle/operators/elementwise_mul_op.cc index 8851267a52..aadb95cbe3 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -36,13 +36,13 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu index b0dfdee1cc..089451b3e1 100644 --- a/paddle/operators/elementwise_mul_op.cu +++ b/paddle/operators/elementwise_mul_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 4469b07eaa..16fa5ec4b3 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseMulBroadCast2GradFunctor { } }; -template +template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseMulGradFunctor, ElementwiseMulBroadCastGradFunctor, ElementwiseMulBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 8aa35b2c46..7ebfc7df8c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -16,6 +16,11 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +#include "paddle/platform/transform.h" + +#ifdef __NVCC__ +#include +#endif #include "paddle/operators/math/math_function.h" @@ -54,18 +59,173 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } +template +class RowwiseTransformIterator; +template +class MidWiseTransformIterator; + +template +class RowwiseTransformIterator { + public: + RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator& operator++() { + ++i_; + if 
(UNLIKELY(i_ == n_)) { + i_ = 0; + } + return *this; + } + + bool operator==(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int i_; + int64_t n_; +}; + +template +class MidWiseTransformIterator { + public: + MidWiseTransformIterator(const T* ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator& operator++() { + ++j_; + i_ = j_ / post_; + if (UNLIKELY(i_ == n_)) { + j_ = 0; + i_ = 0; + } + return *this; + } + + bool operator==(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int i_; + int64_t j_; + int64_t n_; + int post_; +}; + +#ifdef __NVCC__ +template +class RowwiseTransformIterator + : public thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> + super_t; + HOSTDEVICE RowwiseTransformIterator(const T* x, int n) + : super_t(x), begin_(x), n_(n){}; + friend class thrust::iterator_core_access; + + private: + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (this->base() - begin_) % n_); + } +}; + +template +class MidWiseTransformIterator + : public thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> + super_t; + HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) + : super_t(x), begin_(x), n_(n), post_(post){}; + friend class thrust::iterator_core_access; + + private: + unsigned int post_; + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (((this->base() - begin_) / post_) % n_)); + } +}; +#endif + +template +class TransformFunctor { + public: + TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z, const DeviceContext& ctx, Functor func) + : x_(x->data()), + y_(y->data()), + z_(z->mutable_data(ctx.GetPlace())), + nx_(x->numel()), + ctx_(ctx), + func_(func) {} + + inline void Run() const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); + } + + inline void RunMidWise(int n, int pre, int post) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); + } + + private: + const T* x_; + const T* y_; + T* z_; + int64_t nx_; + const DeviceContext& ctx_; + Functor func_; +}; + #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ - template \ + template \ inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ framework::Tensor* z, \ const framework::ExecutionContext& ctx) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_e); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + 
eigen_op(x_e, y_e); \ } \ - template \ + template \ inline void RunBroadCast(const framework::Tensor* x, \ const framework::Tensor* y, framework::Tensor* z, \ const framework::ExecutionContext& ctx, int pre, \ @@ -76,9 +236,11 @@ inline void get_mid_dims(const framework::DDim& x_dims, auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ .broadcast(Eigen::DSizes(pre, 1)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ - template \ + template \ inline void RunBroadCast2(const framework::Tensor* x, \ const framework::Tensor* y, \ framework::Tensor* z, \ @@ -90,11 +252,13 @@ inline void get_mid_dims(const framework::DDim& x_dims, auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ .broadcast(Eigen::DSizes(pre, 1, post)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ } -template +template void ElementwiseCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -110,7 +274,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { if (x_dims == y_dims) { functor f; - f.template Run(x, y, z, ctx); + f.template Run(x, y, z, ctx); return; } @@ -123,11 +287,11 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { get_mid_dims(x_dims, y_dims, axis, pre, n, post); if (post == 1) { functor f; - f.template RunBroadCast(x, y, z, ctx, pre, n); + f.template RunBroadCast(x, y, z, ctx, pre, n); return; } else { functor f; - f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); return; } } @@ -144,8 +308,9 @@ EIGEN_FUNCTOR(Mul, EIGEN_MUL); #define EIGEN_DIV(x, y) ((x) / (y)) EIGEN_FUNCTOR(Div, EIGEN_DIV); -template +template void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -154,7 +319,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc index 95d7979e39..3e4d19361e 100644 --- a/paddle/operators/elementwise_sub_op.cc +++ b/paddle/operators/elementwise_sub_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, elementwise_sub_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu index ec23bec35f..0b2f0f7d4d 100644 --- a/paddle/operators/elementwise_sub_op.cu +++ b/paddle/operators/elementwise_sub_op.cu 
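
The `RowwiseTransformIterator`, `MidWiseTransformIterator`, and `TransformFunctor` added to `elementwise_op_function.h` above let `platform::Transform` broadcast the smaller operand by cycling over it while the larger operand is walked linearly. A minimal, self-contained sketch of the row-wise cycling idea, with toy names rather than the actual Paddle classes:

```cpp
// Sketch of the row-wise broadcast behind RowwiseTransformIterator:
// y has n elements and is cycled while x is traversed linearly, so
// z[i] = x[i] * y[i % n]. Illustrative code only, not the Paddle classes.
#include <cstdio>
#include <vector>

template <typename T>
class RowwiseCycler {
 public:
  RowwiseCycler(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
  RowwiseCycler& operator++() {
    if (++i_ == n_) i_ = 0;  // wrap around after one row
    return *this;
  }
  const T& operator*() const { return ptr_[i_]; }

 private:
  const T* ptr_;
  int i_;
  int n_;
};

int main() {
  // x is a 2 x 3 matrix stored row-major, y is a 3-element row to broadcast.
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  std::vector<float> y = {10, 20, 30};
  std::vector<float> z(x.size());

  RowwiseCycler<float> y_it(y.data(), static_cast<int>(y.size()));
  for (std::size_t i = 0; i < x.size(); ++i, ++y_it) {
    z[i] = x[i] * *y_it;  // elementwise_mul with axis broadcasting
  }
  for (float v : z) std::printf("%.0f ", v);  // prints: 10 40 90 40 100 180
  std::printf("\n");
}
```

The CUDA specializations above obtain the same effect through `thrust::iterator_adaptor`, dereferencing at `(base - begin) % n` for the row-wise case and `((base - begin) / post) % n` for the mid-wise case.
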
@@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index 3f40c1c5bc..731a30c5e3 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseSubBroadCast2GradFunctor { } }; -template +template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseSubOneGradFunctor, ElementwiseSubBroadCastGradFunctor, ElementwiseSubBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc index 282775fcda..8b3cddbb94 100644 --- a/paddle/operators/expand_op.cc +++ b/paddle/operators/expand_op.cc @@ -130,7 +130,8 @@ class ExpandGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, ops::ExpandGradOp); -REGISTER_OP_CPU_KERNEL(expand, - ops::ExpandKernel); REGISTER_OP_CPU_KERNEL( - expand_grad, ops::ExpandGradKernel); + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu index 6744562b6c..99ee584d08 100644 --- a/paddle/operators/expand_op.cu +++ b/paddle/operators/expand_op.cu @@ -17,7 +17,8 @@ #include "paddle/operators/expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(expand, - ops::ExpandKernel); -REGISTER_OP_GPU_KERNEL( - expand_grad, ops::ExpandGradKernel); +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 4d7996ad1e..14ef8b0912 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -56,7 +56,7 @@ template using EigenTensor = framework::EigenTensor; -template +template class ExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -83,12 +83,13 @@ class ExpandKernel : public framework::OpKernel { auto x = EigenTensor::From(*in0); out0->mutable_data(context.GetPlace()); auto y = EigenTensor::From(*out0); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = x.broadcast(bcast_dims); } }; -template +template class ExpandGradKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& context) const override { @@ -164,7 +165,8 @@ class ExpandGradKernel : public framework::OpKernel { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device(context.GetEigenDevice()) = + x_grad.device( + *context.template device_context().eigen_device()) = out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); } }; diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 892922cd3a..7fb74e2b95 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -100,8 +100,11 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpMaker); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc index 9e7a1eeab8..2e0e15f36b 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc @@ -16,10 +16,13 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h index 339d97a30a..66da9d0307 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.h +++ b/paddle/operators/fill_constant_batch_size_like_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -27,8 +27,9 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto value = ctx.Attr("value"); - math::SetConstant setter; - setter(ctx.device_context(), out, static_cast(value)); + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); } }; diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc new file mode 100644 index 0000000000..382e161c5d --- /dev/null +++ b/paddle/operators/fill_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/data_type.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) + : tensor_(tensor), value_(value) {} + + template + void operator()() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector &value_; +}; + +class FillOp : public framework::OperatorBase { + public: + FillOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + auto &out = + detail::Ref(detail::Ref(scope.FindVar(Output("Out")), + "Cannot find variable %s", Output("Out")) + .GetMutable()); + out.Resize(framework::make_ddim(Attr>("shape"))); + auto dtype = static_cast(Attr("dtype")); + platform::CPUPlace cpu; + auto force_cpu = Attr("force_cpu"); + out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(), + framework::ToTypeIndex(dtype)); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. + tensor.Resize(out.dims()); + tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, Attr>("value"))); + + if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) { + // Copy tensor to out + framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out); + } + } +}; + +class FillOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC(Fill operator + +Fill an tensor with `value` and `shape`. The type of the tensor is specify by +`dtype`. +)DOC"); + AddOutput("Out", "(LoDTensor) The output tensor."); + AddAttr>( + "value", "The float values of tensor, which are flatten in row major"); + AddAttr>("shape", "The shape of output tensor"); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::DataType::FP32); + AddAttr("force_cpu", + "Whether the output tensor must be at CPU memory or not. 
" + "Default is false.") + .SetDefault(false); + } +}; + +class FillOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim( + "Out", + framework::make_ddim(context->Attrs().Get>("shape"))); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 95fb5932b8..720c11f5f1 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -54,8 +54,9 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc index 1501a17441..9f412306bb 100644 --- a/paddle/operators/fill_zeros_like_op.cu.cc +++ b/paddle/operators/fill_zeros_like_op.cu.cc @@ -16,9 +16,10 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 7e7d78eea2..a6e2941f52 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -19,15 +19,16 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* out = context.Output("Y"); out->mutable_data(context.GetPlace()); - math::SetConstant setter; - setter(context.device_context(), out, static_cast(0)); + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); } }; diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc index cb7ae69196..b14913ff21 100644 --- a/paddle/operators/ftrl_op.cc +++ b/paddle/operators/ftrl_op.cc @@ -135,5 +135,5 @@ The paper that proposed Follow The Regularized Leader (FTRL): namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); -REGISTER_OP_CPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu index 97b36dade6..abbbe7adbe 100644 --- a/paddle/operators/ftrl_op.cu +++ b/paddle/operators/ftrl_op.cu @@ -15,5 +15,5 @@ specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/ftrl_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h index b040162f8d..4eea04cd8d 100644 --- a/paddle/operators/ftrl_op.h +++ b/paddle/operators/ftrl_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class FTRLOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -53,7 +53,7 @@ class FTRLOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto s_acc_out = EigenVector::Flatten(*sq_accum_out); auto l_acc_out = EigenVector::Flatten(*lin_accum_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h index 8d04ecd284..c806aa5f05 100644 --- a/paddle/operators/gather.cu.h +++ b/paddle/operators/gather.cu.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using platform::Place; +using platform::DeviceContext; #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu index 92219d6a43..b37f0576e2 100644 --- a/paddle/operators/gather_op.cu +++ b/paddle/operators/gather_op.cu @@ -49,7 +49,8 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); @@ -60,5 +61,5 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h index 8276ed0d3d..1a1ba0c41a 100644 --- a/paddle/operators/gather_op.h +++ b/paddle/operators/gather_op.h @@ -53,7 +53,8 @@ class GatherGradientOpKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); ScatterAssign(ctx.device_context(), *dO, *Index, dX); diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index 315560bf1b..ffce6f7138 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -60,5 +60,5 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(gaussian_random, - paddle::operators::GPUGaussianRandomKernel); +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc index 5aa03f8916..311e7edcf1 100644 --- a/paddle/operators/gru_op.cc +++ b/paddle/operators/gru_op.cc @@ -213,8 +213,9 
@@ class GRUGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); -REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc index 0ceff94ec3..458630ca61 100644 --- a/paddle/operators/gru_op.cu.cc +++ b/paddle/operators/gru_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/gru_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_GPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 564489d3a9..6d02dff578 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -27,16 +27,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class GRUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -60,12 +60,12 @@ class GRUKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = context.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } @@ -80,8 +80,9 @@ class GRUKernel : public framework::OpKernel { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. 
- ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); gru_value.prev_out_value = ordered_h0.data(); } else { gru_value.prev_out_value = nullptr; @@ -99,14 +100,14 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + math::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, *hidden); } @@ -116,7 +117,7 @@ class GRUKernel : public framework::OpKernel { } }; -template +template class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -141,14 +142,14 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); - math::SetConstant zero; - auto& dev_ctx = context.device_context(); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); @@ -156,12 +157,13 @@ class GRUGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_h0_grad; const size_t* order = batch_gate->lod()[2].data(); if (h0) { - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); } if (h0_grad) { ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); - zero(context.device_context(), &ordered_h0_grad, static_cast(0.0)); + zero(context.template device_context(), &ordered_h0_grad, + static_cast(0.0)); } bool is_reverse = context.Attr("is_reverse"); @@ -216,25 +218,25 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } - math::GRUUnitGradFunctor::compute( + math::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, *input_grad); } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } if (h0 && h0_grad) { - ReorderInitState(context.device_context(), ordered_h0_grad, - order, h0_grad, false); + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); } } diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc 
index 877c969103..705de87be5 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -201,9 +201,10 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, ops::GRUUnitGradOp); -REGISTER_OP_CPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); REGISTER_OP_CPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu index 821c8c6421..7c752db494 100644 --- a/paddle/operators/gru_unit_op.cu +++ b/paddle/operators/gru_unit_op.cu @@ -16,9 +16,10 @@ #include "paddle/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_GPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index 3398c0934e..8fe60c750d 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -34,7 +34,7 @@ using EigenVector = framework::EigenVector; enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; -template +template class GRUUnitKernel : public framework::OpKernel { public: template @@ -71,7 +71,8 @@ class GRUUnitKernel : public framework::OpKernel { auto g = EigenMatrix::From(*gate); auto r_h_p = EigenMatrix::From(*reset_hidden_prev); auto h = EigenMatrix::From(*hidden); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); // calculate unactivated gate outputs if (bias) { @@ -86,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel { const T* weight_data = weight->data(); T* gate_data = gate->data(); T* reset_hidden_prev_data = reset_hidden_prev->data(); - math::gemm(context.device_context(), false, false, batch_size, - 2 * frame_size, frame_size, 1, hidden_prev_data, - frame_size, weight_data, frame_size * 2, 1, gate_data, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); // calculate activited gate Eigen::array extents({{batch_size, frame_size}}); @@ -102,11 +103,11 @@ class GRUUnitKernel : public framework::OpKernel { g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate r_h_p.device(place) = r * h_p; // reset previous hidden state - math::gemm(context.device_context(), false, false, batch_size, - frame_size, frame_size, 1, reset_hidden_prev_data, - frame_size, weight_data + frame_size * frame_size * 2, - frame_size, 1, gate_data + frame_size * 2, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); Eigen::array c_offsets({{0, frame_size * 2}}); ActCompute(context.Attr("activation"), place, @@ -118,7 +119,7 
@@ class GRUUnitKernel : public framework::OpKernel { } }; -template +template class GRUUnitGradKernel : public framework::OpKernel { public: template @@ -166,7 +167,8 @@ class GRUUnitGradKernel : public framework::OpKernel { auto d_h = EigenMatrix::From(*hidden_grad); auto d_g = EigenMatrix::From(gate_grad); auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; @@ -186,11 +188,11 @@ class GRUUnitGradKernel : public framework::OpKernel { ActGradCompute(context.Attr("activation"), place, c, c, d_g.slice(c_offsets, extents), d_h * u); // backward for reset_hidden_prev - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size, 1, - gate_grad_data + frame_size * 2, frame_size * 3, - weight_data + frame_size * frame_size * 2, frame_size, - 0, reset_hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); // backward for unactivated reset gate ActGradCompute(context.Attr("gate_activation"), place, r, r, d_g.slice(r_offsets, extents), d_r_h_p * h_p); @@ -198,17 +200,18 @@ class GRUUnitGradKernel : public framework::OpKernel { if (weight_grad) { T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); // backward for state_weight - math::gemm( - context.device_context(), true, false, frame_size, frame_size, - batch_size, 1, reset_hidden_prev_data, frame_size, - gate_grad_data + frame_size * 2, frame_size * 3, 0, + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, weight_grad_data + frame_size * frame_size * 2, frame_size); // backward for update_gate_weight and reset_gate_weight - math::gemm(context.device_context(), true, false, frame_size, - frame_size * 2, batch_size, 1, hidden_prev_data, - frame_size, gate_grad_data, frame_size * 3, 0, - weight_grad_data, frame_size * 2); + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); } // backward for hidden_prev if (hidden_prev_grad) { @@ -216,10 +219,11 @@ class GRUUnitGradKernel : public framework::OpKernel { hidden_prev_grad->mutable_data(context.GetPlace()); auto d_h_p = EigenMatrix::From(*hidden_prev_grad); d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size * 2, 1, gate_grad_data, - frame_size * 3, weight_data, frame_size * 2, 1, - hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); } // backward for input if (input_grad) { diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc index 1e13897bb6..373b4d99b4 100644 --- a/paddle/operators/hinge_loss_op.cc +++ b/paddle/operators/hinge_loss_op.cc @@ -106,8 +106,9 @@ class HingeLossGradOp : public 
framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, hinge_loss_grad, ops::HingeLossGradOp); -REGISTER_OP_CPU_KERNEL(hinge_loss, - ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); REGISTER_OP_CPU_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu index ec20b08e30..31a5bde292 100644 --- a/paddle/operators/hinge_loss_op.cu +++ b/paddle/operators/hinge_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/hinge_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h index c0be496f9c..91369cfb8a 100644 --- a/paddle/operators/hinge_loss_op.h +++ b/paddle/operators/hinge_loss_op.h @@ -19,14 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class HingeLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* pred = context.Input("Logits"); auto* label = context.Input("Labels"); auto* loss = context.Output("Loss"); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); @@ -38,7 +39,7 @@ class HingeLossKernel : public framework::OpKernel { } }; -template +template class HingeLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -48,7 +49,8 @@ class HingeLossGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss")); auto* dpred = context.Output(framework::GradVarName("Logits")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc index 938803d5b3..11828d083a 100644 --- a/paddle/operators/huber_loss_op.cc +++ b/paddle/operators/huber_loss_op.cc @@ -124,8 +124,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL(huber_loss, - ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu index 317321dc6c..d49a4d9d42 100644 --- a/paddle/operators/huber_loss_op.cu +++ b/paddle/operators/huber_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/huber_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(huber_loss, - ops::HuberLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git 
a/paddle/operators/huber_loss_op.h b/paddle/operators/huber_loss_op.h index 4e7bc55432..4dd20e8b08 100644 --- a/paddle/operators/huber_loss_op.h +++ b/paddle/operators/huber_loss_op.h @@ -41,7 +41,7 @@ struct HuberLossForward { T delta; }; -template +template class HuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -50,7 +50,8 @@ class HuberLossKernel : public framework::OpKernel { auto* out0 = context.Output("Residual"); auto* out1 = context.Output("Out"); auto delta = static_cast(context.Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); @@ -85,7 +86,7 @@ struct HuberLossBackward { T delta; }; -template +template class HuberLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -94,7 +95,8 @@ class HuberLossGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); auto* out1 = context.Output(framework::GradVarName("Y")); auto delta = static_cast(context.op().Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto residual = EigenVector::Flatten(*in0); auto out_grad = EigenVector::Flatten(*in1); diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc index 02ebf02296..c0b51202c6 100644 --- a/paddle/operators/l1_norm_op.cc +++ b/paddle/operators/l1_norm_op.cc @@ -69,7 +69,8 @@ $$Out = \sum{|X|}$$ namespace ops = paddle::operators; REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL(l1_norm, - ops::L1NormKernel); REGISTER_OP_CPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu index 1c206e04cc..fd725f86f6 100644 --- a/paddle/operators/l1_norm_op.cu +++ b/paddle/operators/l1_norm_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/l1_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(l1_norm, - ops::L1NormKernel); -REGISTER_OP_GPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h index 3c60dc3dc7..ae3878f2b7 100644 --- a/paddle/operators/l1_norm_op.h +++ b/paddle/operators/l1_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(abs(X)) -template +template class L1NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class L1NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); out.device(place) = x.abs().sum(); } }; // dX = dout * sign(X) -template +template class L1NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -52,7 +53,8 @@ class L1NormGradKernel : public framework::OpKernel { auto x_eigen = 
framework::EigenVector::Flatten(*x); auto d_out_eigen = framework::EigenVector::Flatten(*d_out); auto dx_eigen = framework::EigenVector::Flatten(*dx); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); Eigen::DSizes x_dsize(x->numel()); dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 8e079a14e0..896e3657d4 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -261,9 +261,10 @@ REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, linear_chain_crf_grad, ops::LinearChainCRFGradOp); REGISTER_OP_CPU_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); REGISTER_OP_CPU_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu index 6fc8995f4c..3b105ec341 100644 --- a/paddle/operators/linear_chain_crf_op.cu +++ b/paddle/operators/linear_chain_crf_op.cu @@ -16,11 +16,12 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 014bbfa758..694584e79c 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -50,7 +50,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class LinearChainCRFOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -137,7 +137,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { framework::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context() + .eigen_device(); auto x = EigenMatrix::From(*emission_weights); auto x_row_max = EigenMatrix::From(emission_row_max); x_row_max.device(place) = @@ -287,7 +288,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { } }; -template +template class LinearChainCRFGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -359,8 +360,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { emission_grad->mutable_data(platform::CPUPlace()); if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - transition_grad, 0.); + math::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. 
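
Most hunks in this section repeat one mechanical pattern: kernels previously templated on a place type are now templated on a device-context type, and the Eigen device is obtained via `ctx.template device_context<DeviceContext>().eigen_device()` instead of `ctx.GetEigenDevice()`. A schematic, self-contained mock of that pattern, using toy stand-ins rather than the real `framework::ExecutionContext`:

```cpp
// Toy illustration of the Place -> DeviceContext kernel refactor.
// The types here are simplified stand-ins, not Paddle framework classes.
#include <iostream>

struct ToyCPUDeviceContext {
  const char* eigen_device() const { return "Eigen::DefaultDevice"; }
};

struct ToyExecutionContext {
  template <typename DeviceContext>
  const DeviceContext& device_context() const {
    static DeviceContext ctx;  // one context instance per type
    return ctx;
  }
};

template <typename DeviceContext, typename T>
struct ToyKernel {
  void Compute(const ToyExecutionContext& ctx) const {
    // New style: look up the device context by type, then ask it for
    // the Eigen device, instead of asking the context for a Place device.
    auto& dev = ctx.template device_context<DeviceContext>();
    std::cout << "running on " << dev.eigen_device() << "\n";
  }
};

int main() {
  ToyExecutionContext ctx;
  ToyKernel<ToyCPUDeviceContext, float>{}.Compute(ctx);
}
```
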
@@ -384,10 +384,10 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor one_seq_beta = beta.Slice(start_pos, end_pos); Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); - BackwardOneSequence(ctx.device_context(), ll_grad[i], - one_seq_emission_exps, *transition_exps, - one_seq_alpha, one_seq_label, &one_seq_beta, - transition_grad, &one_seq_emission_grad); + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); } if (platform::is_gpu_place(ctx.GetPlace())) { @@ -441,8 +441,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { copyTensor(ctx, transition_grad_src, transition_grad_dst); } - void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad, - const Tensor& emission_exps, + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, const Tensor& transition_exps, const Tensor& alpha, const Tensor& label, Tensor* beta, Tensor* transition_grad, @@ -481,7 +481,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto alpha_mat = EigenMatrix::From(alpha); auto beta_mat = EigenMatrix::From(*beta); - auto* place = ctx.GetEigenDevice(); + auto* place = ctx.eigen_device(); auto prob = alpha_mat * beta_mat; auto row_sum = prob.sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(seq_length, 1)) diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc index 80445eb575..b2f4ec57fa 100644 --- a/paddle/operators/lod_array_length_op.cc +++ b/paddle/operators/lod_array_length_op.cc @@ -43,12 +43,16 @@ class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensorArray) The input tensor array."); AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t"); - AddComment(R"DOC(Get the length of lod tensor array + AddComment(R"DOC( +LoDArrayLength Operator. -Out = len(X) +This operator obtains the length of lod tensor array: + +$$Out = len(X)$$ NOTE: The output is a CPU Tensor since the control variable should be only in CPU and the length of LoDTensorArray should be used as control variables. 
+ )DOC"); } }; diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu index 5244a17c3a..f7c2358980 100644 --- a/paddle/operators/lod_reset_op.cu +++ b/paddle/operators/lod_reset_op.cu @@ -16,9 +16,10 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_GPU_KERNEL( - lod_reset_grad, ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index cbcbf80adc..b86f8b1313 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class LoDResetKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -65,7 +65,7 @@ class LoDResetKernel : public framework::OpKernel { } }; -template +template class LoDResetGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc index 257e5c8a49..4524229a33 100644 --- a/paddle/operators/log_loss_op.cc +++ b/paddle/operators/log_loss_op.cc @@ -109,7 +109,8 @@ class LogLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL(log_loss, - ops::LogLossKernel); REGISTER_OP_CPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu index 6c189ef341..e87ac7d12a 100644 --- a/paddle/operators/log_loss_op.cu +++ b/paddle/operators/log_loss_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/log_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(log_loss, - ops::LogLossKernel); -REGISTER_OP_GPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h index 73404fce91..743eddb740 100644 --- a/paddle/operators/log_loss_op.h +++ b/paddle/operators/log_loss_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class LogLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -38,7 +38,7 @@ class LogLossKernel : public framework::OpKernel { auto label = EigenVector::Flatten(*ctx.Input("Labels")); auto loss = EigenVector::Flatten(*loss_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); loss.device(place) = (-(label * (prediction + epsilon).log()) - ((static_cast(1) - label) * @@ -46,7 +46,7 @@ class LogLossKernel : public framework::OpKernel { } }; -template +template class LogLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -59,7 +59,7 @@ class LogLossGradKernel : public framework::OpKernel { auto* dpred = ctx.Output(framework::GradVarName("Predicted")); auto dl = 
EigenVector::Flatten(*dloss); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); if (dpred) { dpred->mutable_data(ctx.GetPlace()); diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc index a37582c1d8..c818d5e9c1 100644 --- a/paddle/operators/logical_op.cc +++ b/paddle/operators/logical_op.cc @@ -139,15 +139,16 @@ class LogicalOp : public framework::OperatorWithKernel { ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \ ::paddle::framework::EmptyGradOpMaker); -REGISTER_BINARY_LOGICAL_OP(logical_and, "Out = X && Y"); +REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_OP(logical_or, "Out = X && Y"); +REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_OP(logical_not, "Out = !X"); +REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_OP(logical_xor, "Out = (X || Y) && !(X && Y)"); +REGISTER_BINARY_LOGICAL_OP(logical_xor, + "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, paddle::operators::LogicalXorFunctor); diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu index d41239b2ca..7fef60e0c9 100644 --- a/paddle/operators/logical_op.cu +++ b/paddle/operators/logical_op.cu @@ -14,11 +14,11 @@ #include "paddle/operators/logical_op.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU, +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, paddle::operators::LogicalXorFunctor); diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h index 6e78a7d6ed..629388cac8 100644 --- a/paddle/operators/logical_op.h +++ b/paddle/operators/logical_op.h @@ -47,7 +47,7 @@ struct LogicalXorFunctor { } }; -template +template class BinaryLogicalOpKernel : public framework::OpKernel { public: @@ -57,14 +57,14 @@ class BinaryLogicalOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; -template +template class UnaryLogicalOpKernel : public framework::OpKernel { public: @@ -73,8 +73,9 @@ class UnaryLogicalOpKernel auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), out->mutable_data(context.GetPlace()), unary_func); } }; @@ -85,9 +86,9 @@ class UnaryLogicalOpKernel 
#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); #define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 84b044184a..9431030a53 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -85,6 +85,8 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); bool is_sparse = context.Attr("is_sparse"); if (is_sparse) { auto* ids = context.Input("Ids"); @@ -95,7 +97,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto* ids_data = ids->data(); auto ids_dim = ids->dims(); - auto stream = context.cuda_device_context().stream(); + auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_dim[0]); @@ -129,14 +131,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { T* d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(context.GetEigenDevice()) = - t.constant(static_cast(0)); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); dim3 threads(128, 8); dim3 grids(8, 1); - LookupTableGrad< - T, 128, 8, - 8><<>>( + LookupTableGrad<<>>( d_table, d_output, ids, N, K, D); } } @@ -146,7 +145,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); -REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index 00392b7967..b5b7bc940a 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -19,6 +19,103 @@ namespace operators { using framework::Tensor; +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + auto x_v = framework::EigenVector::Flatten(input); + + const int start = -(n - 1) / 2; + const int end = start + n; + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto e_x = framework::EigenTensor::From(input); + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch >= 0 && ch < C) { + auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + s += alpha * r.square(); + } + } + } + } + + auto out_e = framework::EigenVector::Flatten(*out); + out_e = x_v * 
e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + } +}; +template struct LRNFunctor; +template struct LRNFunctor; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + T ratio = -2 * alpha * beta; + auto x_g_e = framework::EigenVector::Flatten(*x_g); + x_g_e = x_g_e.constant(0.0); + + auto e_x = framework::EigenTensor::From(x); + auto e_x_g = framework::EigenTensor::From(*x_g); + auto e_out = framework::EigenTensor::From(out); + auto e_out_g = framework::EigenTensor::From(out_g); + auto e_mid = framework::EigenTensor::From(mid); + + const int start = -(n - 1) / 2; + const int end = start + n; + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_out_g = e_out_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g = i_mid.pow(-beta) * i_out_g; + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch < 0 || ch >= C) { + continue; + } + + auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g += ratio * c_out_g * c_out * i_x / c_mid; + } + } + } + } +}; +template struct LRNGradFunctor; +template struct LRNGradFunctor; + class LRNOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -83,8 +180,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Local Response Normalization Operator. -This operator comes from the paper -"ImageNet Classification with Deep Convolutional Neural Networks". +This operator comes from the paper: +<>. The original formula is: @@ -107,7 +204,7 @@ Input(i, x, y), Output(i, x, y) represents an element in an image. C is the number of feature maps of one image. n is a hyper-parameter configured when operator is initialized. The sum in the denominator is the sum of the same positions in the neighboring maps. 
- + )DOC"); } }; @@ -119,8 +216,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("MidOut")), - "Input(MidOut@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); @@ -134,6 +230,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu index 607dc6d86a..c6857c2b6d 100644 --- a/paddle/operators/lrn_op.cu +++ b/paddle/operators/lrn_op.cu @@ -12,11 +12,167 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/operators/lrn_op.h" -namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C, + int H, int W, int size, T k, T alpha) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + + in += offset; + mid += offset; + const int step = H * W; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + + T accum = 0; + int index = 0; + while (index < C + post_pad) { + if (index < C) { + T val = in[index * step]; + accum += val * val; + } + if (index >= size) { + T val = in[(index - size) * step]; + accum -= val * val; + } + if (index >= post_pad) { + mid[(index - post_pad) * step] = k + accum * alpha; + } + ++index; + } + } +} + +template +__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid, + T negative_beta, T* out) { + const int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < input_size) { + out[index] = in[index] * pow(mid[index], negative_beta); + } +} + +template +void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, + T* outputs, T* mid, int N, int C, int H, int W, int n, T k, + T alpha, T beta) { + int img_size = N * H * W; + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( + img_size, inputs, mid, C, H, W, n, k, alpha); + + int input_size = N * H * W * C; + grid_size = (input_size + block_size - 1) / block_size; + KeCMRNormOutput<<>>( + input_size, inputs, mid, -beta, outputs); +} + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + CrossMapNormal( + ctx, input.data(), out->mutable_data(ctx.GetPlace()), + mid->mutable_data(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta); + } +}; + +template struct LRNFunctor; +template struct LRNFunctor; -REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_GPU_KERNEL(lrn_grad, 
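The registration changes above follow the pattern applied throughout this patch: kernels are now templated on a device-context type (platform::CPUDeviceContext or platform::CUDADeviceContext) instead of a place type, and REGISTER_OP_GPU_KERNEL becomes REGISTER_OP_CUDA_KERNEL. A minimal sketch of the convention, using a hypothetical foo operator rather than anything from this patch (header path as of this tree):

#include "paddle/framework/op_registry.h"  // OpKernel, REGISTER_OP_*_KERNEL

namespace ops = paddle::operators;

// Kernel templated on DeviceContext; the Eigen device (and, on GPU, the
// stream) is obtained from the context rather than from a Place.
template <typename DeviceContext, typename T>
class FooKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto& place = *dev_ctx.eigen_device();
    // ... evaluate Eigen expressions with .device(place) ...
  }
};

// CPU registration (foo_op.cc)
REGISTER_OP_CPU_KERNEL(
    foo, ops::FooKernel<paddle::platform::CPUDeviceContext, float>,
    ops::FooKernel<paddle::platform::CPUDeviceContext, double>);

// CUDA registration (foo_op.cu); this was REGISTER_OP_GPU_KERNEL before the patch
REGISTER_OP_CUDA_KERNEL(
    foo, ops::FooKernel<paddle::platform::CUDADeviceContext, float>,
    ops::FooKernel<paddle::platform::CUDADeviceContext, double>);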
- ops::LRNGradKernel); +template +__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, + const T* mid, T* x_g, const T* out_g, int C, + int H, int W, int size, T negative_beta, + T ratio) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + x += offset; + out += offset; + mid += offset; + out_g += offset; + x_g += offset; + + const int step = H * W; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + + int index = 0; + T accum = 0; + // TODO(gongwb): optimize this with thread shared array. + while (index < C + post_pad) { + if (index < C) { + x_g[index * step] = 0.0; + accum += out_g[index * step] * out[index * step] / mid[index * step]; + } + if (index >= size) { + accum -= out_g[(index - size) * step] * out[(index - size) * step] / + mid[(index - size) * step]; + } + if (index >= post_pad) { + x_g[(index - post_pad) * step] += + out_g[(index - post_pad) * step] * + pow(mid[(index - post_pad) * step], negative_beta) - + ratio * x[(index - post_pad) * step] * accum; + } + ++index; + } + } +} + +template +void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, + const T* out, const T* mid, T* x_g, const T* out_g, + int N, int C, int H, int W, int n, T alpha, T beta) { + int img_size = N * H * W; + + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( + img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, + 2.0f * alpha * beta); +} + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + CrossMapNormalGrad(ctx, x.data(), out.data(), mid.data(), + x_g->mutable_data(ctx.GetPlace()), out_g.data(), + N, C, H, W, n, alpha, beta); + } +}; + +template struct LRNGradFunctor; +template struct LRNGradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h index 606c657443..44063d3e03 100644 --- a/paddle/operators/lrn_op.h +++ b/paddle/operators/lrn_op.h @@ -21,7 +21,15 @@ namespace paddle { namespace operators { -template +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta); +}; + +template class LRNKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -31,8 +39,8 @@ class LRNKernel : public framework::OpKernel { // f(x) represents outputs void Compute(const framework::ExecutionContext& ctx) const override { // input - const Tensor* x = ctx.Input("X"); - auto x_dims = x->dims(); + const Tensor& x = *ctx.Input("X"); + auto x_dims = x.dims(); // NCHW int N = x_dims[0]; @@ -57,38 +65,20 @@ class LRNKernel : public framework::OpKernel { PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); - auto x_v = framework::EigenVector::Flatten(*x); - - const int start = -(n - 
1) / 2; - const int end = start + n; - - auto e_mid = framework::EigenTensor::From(*mid); - e_mid.device(ctx.GetEigenDevice()) = e_mid.constant(k); - - auto e_x = framework::EigenTensor::From(*x); - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - for (int c = start; c <= end; c++) { - int ch = i + c; - if (ch >= 0 && ch < C) { - auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - s.device(ctx.GetEigenDevice()) += alpha * r.square(); - } - } - } - } - - auto out_e = framework::EigenVector::Flatten(*out); - out_e.device(ctx.GetEigenDevice()) = - x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + LRNFunctor f; + f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); } }; +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta); +}; + /** * \brief Backward calculation for normalization with across maps. * @@ -97,7 +87,7 @@ class LRNKernel : public framework::OpKernel { * The implementation of this Function is derived from the * CrossMapNormalFunc implementation. * - * InputGrad = OutputGrad * denoms ^ (-beta) + * InputGrad = OutputGrad * MidOut ^ (-beta) * -- upper * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue * -- lower @@ -108,23 +98,20 @@ class LRNKernel : public framework::OpKernel { * The upper and lower is the same as forward. The logic of the sum * is also the same as forward. */ -template +template class LRNGradKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - const Tensor* out = ctx.Input("Out"); - const Tensor* out_g = ctx.Input(framework::GradVarName("Out")); - const Tensor* mid = ctx.Input("MidOut"); + const Tensor& x = *ctx.Input("X"); + const Tensor& out = *ctx.Input("Out"); + const Tensor& out_g = *ctx.Input(framework::GradVarName("Out")); + const Tensor& mid = *ctx.Input("MidOut"); auto x_g = ctx.Output(framework::GradVarName("X")); x_g->mutable_data(ctx.GetPlace()); - auto x_g_e = framework::EigenVector::Flatten(*x_g); - x_g_e.device(ctx.GetEigenDevice()) = x_g_e.constant(0.0); - - auto x_dims = x->dims(); + auto x_dims = x.dims(); int N = x_dims[0]; int C = x_dims[1]; int H = x_dims[2]; @@ -133,51 +120,9 @@ class LRNGradKernel : public framework::OpKernel { int n = ctx.Attr("n"); T alpha = ctx.Attr("alpha"); T beta = ctx.Attr("beta"); - T ratio = -2 * alpha * beta; - - auto e_x = framework::EigenTensor::From(*x); - auto e_x_g = framework::EigenTensor::From(*x_g); - auto e_out = framework::EigenTensor::From(*out); - auto e_out_g = framework::EigenTensor::From(*out_g); - auto e_mid = framework::EigenTensor::From(*mid); - - const int start = -(n - 1) / 2; - const int end = start + n; - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_out_g = e_out_g.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - 
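Written out, the backward rule sketched in the "upper / lower" comment above is (my notation; the sum runs over the same n-map channel window as the forward pass, and spatial indices are elementwise):

\[
\frac{\partial L}{\partial x_c}
\;=\;
\frac{\partial L}{\partial y_c}\,\mathrm{MidOut}_c^{-\beta}
\;-\;
2\alpha\beta\, x_c \sum_{c' \in \mathcal{N}(c)} \frac{\partial L}{\partial y_{c'}}\,\frac{y_{c'}}{\mathrm{MidOut}_{c'}}
\]

which corresponds to ratio = -2 * alpha * beta and the accumulation i_x_g += ratio * c_out_g * c_out * i_x / c_mid in LRNGradFunctor.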
i_x_g.device(ctx.GetEigenDevice()) = i_mid.pow(-beta) * i_out_g; - for (int c = start; c <= end; c++) { - int ch = i + c; - if (ch < 0 || ch >= C) { - continue; - } - - auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - i_x_g.device(ctx.GetEigenDevice()) += - ratio * c_out_g * c_out * i_x / c_mid; - } - } - } + + LRNGradFunctor f; + f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); } }; diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index fa8e5f2da8..2db7da30db 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -273,8 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc index 610cbb03e8..48519bed6f 100644 --- a/paddle/operators/lstm_op.cu.cc +++ b/paddle/operators/lstm_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_GPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index a78f548aaf..14abd4bf0a 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -24,16 +24,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class LSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,8 +52,8 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, *batch_gate, true, is_reverse); auto in_dims = input->dims(); @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -88,8 +88,8 @@ class LSTMKernel : public framework::OpKernel { // Since the batch computing for LSTM reorders the input 
sequence // according to their length. The initialized cell state also needs // to reorder. - ReorderInitState(device_ctx, *cell_t0, order, &ordered_c0, - true); + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); lstm_value.prev_state_value = ordered_c0.data(); } @@ -121,9 +121,9 @@ class LSTMKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden_t, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } else if (hidden_t0) { // If n == 0 and there is no initialized hidden state, that is to say // the H0 is zeros, the calculation W_h * H0 will be skiped. @@ -133,24 +133,24 @@ class LSTMKernel : public framework::OpKernel { // according to their length. The initialized hidden state also needs // to reorder. Tensor ordered_h0; - ReorderInitState(device_ctx, *hidden_t0, order, &ordered_h0, - true); - math::matmul(device_ctx, ordered_h0, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } lstm_value.gate_value = gate_t.data(); lstm_value.output_value = out_t.data(); lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute(device_ctx, lstm_value, - frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, *hidden_out); @@ -161,7 +161,7 @@ class LSTMKernel : public framework::OpKernel { } }; -template +template class LSTMGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -187,8 +187,8 @@ class LSTMGradKernel : public framework::OpKernel { auto* h0_g = ctx.Output(framework::GradVarName("H0")); auto* c0_g = ctx.Output(framework::GradVarName("C0")); - auto& device_ctx = ctx.device_context(); - math::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -200,7 +200,8 @@ class LSTMGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; const size_t* order = batch_gate->lod()[2].data(); if (c0) { - ReorderInitState(device_ctx, *c0, order, &ordered_c0, true); + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); } if (c0 && c0_g) { ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); @@ -240,10 +241,10 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( - const platform::DeviceContext& ctx, const framework::LoDTensor& src, + const DeviceContext& ctx, const framework::LoDTensor& 
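As context for the loop above: once the gate pre-activations are assembled (the input projection already held in batch_gate, plus the recurrent term added by the matmul against weight), LstmUnitFunctor::compute applies the LSTM cell update. This is the standard formulation as I read the code, with the nonlinearities selected by the gate_act, cell_act and cand_act attributes and peephole terms (the check_* weights) omitted for brevity:

\[
i_t = \sigma_g(g_i),\quad f_t = \sigma_g(g_f),\quad o_t = \sigma_g(g_o),\quad \tilde{c}_t = \sigma_c(g_c)
\]
\[
c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c}_t,
\qquad
h_t = o_t \odot \sigma_h(c_t)
\]

where g_i, g_f, g_o, g_c are the four frame_size-wide slices of the gate tensor for the current batch step.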
src, const framework::DDim& dims, framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); @@ -299,7 +300,7 @@ class LSTMGradKernel : public framework::OpKernel { } int cur_batch_size = bend - bstart; - math::LstmUnitGradFunctor::compute( + math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, gate_act, cell_act, cand_act); @@ -307,33 +308,34 @@ class LSTMGradKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &pre_hidden_g, - static_cast(1.0)); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); if (weight_g) { /* backward weight */ auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } } else { if (h0 && weight_g) { - ReorderInitState(device_ctx, *h0, order, &ordered_h0, true); - math::matmul(device_ctx, ordered_h0, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } if (h0 && h0_g) { ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &ordered_h0_g, - static_cast(0.0)); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); } } } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); @@ -344,15 +346,17 @@ class LSTMGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } if (h0 && h0_g) { - ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, false); + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); } if (c0 && c0_g) { - ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, false); + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); } } }; diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index e192283aa0..291f2c295e 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -173,7 +173,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, - ops::LstmUnitOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, - ops::LstmUnitGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h index 38cb298f92..61705675d9 100644 --- a/paddle/operators/lstm_unit_op.h +++ 
b/paddle/operators/lstm_unit_op.h @@ -35,7 +35,7 @@ inline T tanh(T x) { return 2. * sigmoid(2. * x) - 1.; } -template +template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -78,7 +78,7 @@ class LstmUnitKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc index d7e8a0ea76..42e8961c0e 100644 --- a/paddle/operators/margin_rank_loss_op.cc +++ b/paddle/operators/margin_rank_loss_op.cc @@ -117,7 +117,7 @@ REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); + ops::MarginRankLossKernel); REGISTER_OP_CPU_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu index 3a639f25d4..1c2afccc5b 100644 --- a/paddle/operators/margin_rank_loss_op.cu +++ b/paddle/operators/margin_rank_loss_op.cu @@ -16,9 +16,9 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_GPU_KERNEL( + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h index 8d0830147e..9c1f96cac1 100644 --- a/paddle/operators/margin_rank_loss_op.h +++ b/paddle/operators/margin_rank_loss_op.h @@ -34,7 +34,7 @@ struct Heaviside { } }; -template +template class MarginRankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -56,13 +56,13 @@ class MarginRankLossKernel : public framework::OpKernel { auto x1 = framework::EigenVector::Flatten(*x1_t); auto x2 = framework::EigenVector::Flatten(*x2_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); act.device(dev) = out.unaryExpr(Heaviside()); } }; -template +template class MarginRankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -78,7 +78,7 @@ class MarginRankLossGradKernel : public framework::OpKernel { auto d_out = framework::EigenVector::Flatten(*d_out_t); auto act = framework::EigenVector::Flatten(*act_t); auto label = framework::EigenVector::Flatten(*label_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); // compute d_x1 if (d_x1_t) { diff --git a/paddle/operators/math/context_project.cc b/paddle/operators/math/context_project.cc index f82ea5d7be..980dd90df8 100644 --- a/paddle/operators/math/context_project.cc +++ b/paddle/operators/math/context_project.cc @@ -18,8 +18,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.cu b/paddle/operators/math/context_project.cu index 04eeed543c..934e3df645 100644 --- 
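Restating the Eigen expressions of the ported MarginRankLossKernel above in plain math, the kernel computes per sample

\[
\mathrm{out} = \max\bigl(0,\; -\,\mathrm{label}\cdot(x_1 - x_2) + \mathrm{margin}\bigr),
\qquad
\mathrm{act} = \mathbf{1}\bigl[-\,\mathrm{label}\cdot(x_1 - x_2) + \mathrm{margin} > 0\bigr]
\]

with act (the Heaviside indicator) saved as an activation mask so that the backward kernel only propagates gradient through samples that still violate the margin.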
a/paddle/operators/math/context_project.cu +++ b/paddle/operators/math/context_project.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index d853507188..4036614086 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -81,17 +81,17 @@ using LoDTensor = framework::LoDTensor; * */ -template +template class ContextProjectFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, const Tensor& padding_data, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Im2ColFunctor im2col_ocf; + math::Im2ColFunctor im2col_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -188,17 +188,17 @@ class ContextProjectFunctor { } }; -template +template class ContextProjectGradFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, bool pad_grad, bool input_grad, Tensor* padding_data, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Col2ImFunctor col2im_ocf; + math::Col2ImFunctor col2im_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -258,8 +258,8 @@ class ContextProjectGradFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } if (down_pad > 0) { @@ -290,8 +290,8 @@ class ContextProjectGradFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc index cf238a58e0..6011a196d4 100644 --- a/paddle/operators/math/cross_entropy.cc +++ b/paddle/operators/math/cross_entropy.cc @@ -24,9 +24,9 @@ template ; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel) { const int batch_size = prob->dims()[0]; @@ -35,7 +35,7 @@ class CrossEntropyFunctor { auto lbl = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*out); - loss.device(*ctx.GetEigenDevice()) = + loss.device(*ctx.eigen_device()) = -((lbl * in.log().unaryExpr(math::TolerableValue())) 
.sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(batch_size, 1))); @@ -53,8 +53,8 @@ class CrossEntropyFunctor { } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu index 651c08f740..2132d49c93 100644 --- a/paddle/operators/math/cross_entropy.cu +++ b/paddle/operators/math/cross_entropy.cu @@ -95,10 +95,10 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, using Tensor = framework::Tensor; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, - const framework::Tensor* prob, + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, bool softLabel) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -118,16 +118,14 @@ class CrossEntropyFunctor { const int64_t* label_data = labels->data(); int block = 512; int grid = (batch_size + block - 1) / block; - CrossEntropyKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( + CrossEntropyKernel<<>>( loss_data, prob_data, label_data, batch_size, class_num); } } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h index 70ed9ddd55..677adb5ada 100644 --- a/paddle/operators/math/cross_entropy.h +++ b/paddle/operators/math/cross_entropy.h @@ -33,11 +33,11 @@ struct TolerableValue { } }; -template +template class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& context, - framework::Tensor* out, const framework::Tensor* prob, + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel); }; } // namespace math diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index ae4e47b014..d570c68cd4 100644 --- a/paddle/operators/math/gru_compute.cc +++ b/paddle/operators/math/gru_compute.cc @@ -19,14 +19,14 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -36,7 +36,7 @@ struct GRUUnitFunctor { frame_size, batch_size, active_gate); if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -49,8 +49,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext 
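Both CrossEntropyFunctor specializations above (the Eigen CPU path and the CUDA kernel path) evaluate the same loss; restated here for reference:

\[
\text{soft labels:}\quad \mathrm{loss}_i = -\sum_{j=1}^{D} \mathrm{label}_{ij}\,\log\bigl(\mathrm{prob}_{ij}\bigr),
\qquad
\text{hard labels:}\quad \mathrm{loss}_i = -\log\bigl(\mathrm{prob}_{i,\,y_i}\bigr)
\]

where D is the class count and y_i the integer label. TolerableValue is applied around the log so that vanishing probabilities do not overflow the reduction.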
&context, +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, @@ -60,13 +60,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_node); if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -78,13 +78,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_gate); if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -94,10 +94,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index 0252bdbdb6..dd518cd1e4 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -19,13 +19,12 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -39,7 +38,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -62,7 +61,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -87,14 +86,13 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -124,13 +122,13 @@ struct GRUUnitGradFunctor { } if 
(value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -155,13 +153,13 @@ struct GRUUnitGradFunctor { } if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -170,10 +168,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h index 58ea59f68e..ca1343cb2c 100644 --- a/paddle/operators/math/gru_compute.h +++ b/paddle/operators/math/gru_compute.h @@ -40,19 +40,18 @@ struct hl_gru_grad { T *prev_out_grad; }; -template +template struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frame_size, int batch_size, + static void compute(const DeviceContext &context, hl_gru_value value, + int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; -template +template struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, hl_gru_value value, + hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index c10c44c520..707ebf0596 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -25,9 +25,9 @@ namespace math { */ template class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -90,9 +90,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -149,13 +149,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; /* * im = [input_channels, input_height, 
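Relating the two gemm calls in the GRUUnitFunctor specializations above to the GRU equations (a standard formulation as I read the code; the final interpolation is decided inside forward_final_output, which is not shown in these hunks):

\[
[u_t,\; r_t] = \sigma\bigl(x_{u,r} + h_{t-1} W_g\bigr),
\qquad
\tilde{h}_t = \phi\bigl(x_c + (r_t \odot h_{t-1})\, W_s\bigr)
\]

The first gemm adds h_{t-1} W_g (a frame_size by 2*frame_size product) onto the update/reset gate pre-activations; the second adds (r_t ⊙ h_{t-1}) W_s, held in reset_output_value, onto the candidate slice at gate_value + 2*frame_size. The hidden state h_t is then an update-gate interpolation between h_{t-1} and the candidate.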
input_width] @@ -164,9 +164,9 @@ template class Col2ImFunctor class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -235,9 +235,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -300,13 +300,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index bf78942439..a88e837b03 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -58,9 +58,9 @@ __global__ void im2col(const T* data_im, int num_outs, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -96,9 +96,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2col<<>>( im.data(), num_outputs, im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -160,9 +158,9 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -203,9 +201,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2im<<>>( num_kernels, col.data(), im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[2], col_height, col_width, im->data()); @@ -213,13 +209,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; template __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, @@ -260,9 +256,9 @@ __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -310,9 +306,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2colOCF<<>>( im.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], 
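For the Im2ColFunctor/Col2ImFunctor specializations being retemplated above, the column buffer dimensions follow the usual convolution output-size rule. The exact formula is not visible in these hunks, so this is stated as an assumption based on the standard rule and the (up, left, down, right) reading of the padding vector:

\[
\text{col\_height} = \left\lfloor \frac{\text{im\_height} + \text{pad}_{\text{up}} + \text{pad}_{\text{down}} - \bigl(\text{dilation}_h(\text{filter\_height}-1)+1\bigr)}{\text{stride}_h} \right\rfloor + 1
\]

and analogously for col_width with the horizontal pads. In the kCFO format the column tensor is laid out as [input_channels, filter_height, filter_width, col_height, col_width]; kOCF stores [col_height, col_width, input_channels, filter_height, filter_width], which is the layout ContextProjectFunctor relies on.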
padding[1], col_height, col_width, col->data()); @@ -358,9 +352,9 @@ __global__ void col2imOCF(const T* col_data, int im_channels, int im_height, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -409,9 +403,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2imOCF<<>>( col.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, im->data()); @@ -419,13 +411,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index 24fd9a06e9..38f2c9fe0a 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -79,20 +79,19 @@ enum class ColFormat { kCFO = 0, kOCF = 1 }; * \note The caller needs to ensure that imShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Im2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col); }; -template +template class Col2ImFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im); diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index ae197a97ed..256f3bc9bd 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include -template +template void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; @@ -59,18 +59,7 @@ void testIm2col() { memcpy(input_ptr, arr, 6 * sizeof(float)); auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { @@ -83,10 +72,10 @@ void testIm2col() { // Im2Col paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> im2col; paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> im2col_ocf; im2col(*context, input, dilation, stride, padding, &output_cfo); @@ -119,10 +108,10 @@ void testIm2col() { // Col2Im: kCFO paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> col2im; paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -168,8 +157,8 @@ void testIm2col() { } TEST(math, im2col) { - testIm2col(); + testIm2col(); #ifdef PADDLE_WITH_CUDA - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc index ad3a59bcdb..2c2e8bb82e 100644 --- a/paddle/operators/math/lstm_compute.cc +++ b/paddle/operators/math/lstm_compute.cc @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -42,8 +42,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -72,10 +72,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu index b2122f2a5c..92b1f4228b 100644 --- a/paddle/operators/math/lstm_compute.cu +++ b/paddle/operators/math/lstm_compute.cu @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor 
{ + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -33,8 +33,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -45,10 +45,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 9652399d4c..5f74e27358 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -67,21 +67,20 @@ inline activation_mode_t ActiveType(const std::string &type) { } } -template +template class LstmUnitFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; -template +template class LstmUnitGradFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 2e333a8cde..2b35e4532a 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -21,13 +21,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -36,13 +34,11 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { int lda = (transA == CblasNoTrans) ? 
K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -51,35 +47,32 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -99,15 +92,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -127,16 +121,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. 
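The comment above refers to MKL's grouped batch interface; when MKL is not available, the patch falls back to looping over plain cblas_sgemm calls. A minimal standalone sketch of what that strided-batch contract means (hypothetical buffers, row-major, no transposes, one group), mirroring the fallback path rather than the MKL call itself:

#include <cblas.h>

// Multiply batchCount independent MxK by KxN matrices stored back to back:
// A advances by strideA per batch, B by strideB, C is packed as M*N blocks.
void batched_sgemm_fallback(int M, int N, int K, float alpha, const float* A,
                            const float* B, float beta, float* C,
                            int batchCount, int strideA, int strideB) {
  for (int k = 0; k < batchCount; ++k) {
    const float* Ak = A + k * strideA;
    const float* Bk = B + k * strideB;
    float* Ck = C + k * M * N;
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha,
                Ak, K /*lda*/, Bk, N /*ldb*/, beta, Ck, N /*ldc*/);
  }
}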
template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -157,8 +151,8 @@ void batched_gemm( } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -183,8 +177,8 @@ void batched_gemm( // functions of Intel MKL are not available. In the future, this computation // should be parallelized. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -192,14 +186,14 @@ void batched_gemm( const float* Ak = &A[k * strideA]; const float* Bk = &B[k * strideB]; float* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, Ak, - Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -207,55 +201,53 @@ void batched_gemm( const double* Ak = &A[k * strideA]; const double* Bk = &B[k * strideB]; double* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, - Ak, Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } #endif template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? 
CblasNoTrans : CblasTrans; cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { cblas_daxpy(n, alpha, x, 1, y, 1); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -310,10 +302,10 @@ void set_constant(const platform::DeviceContext& context, #endif } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 3018e50a4f..1b560a7e2d 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -22,13 +22,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -39,19 +37,16 @@ void gemm(const platform::DeviceContext& context, (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? 
K : M; @@ -61,51 +56,45 @@ void gemm(const platform::DeviceContext& context, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -125,15 +114,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -153,14 +143,14 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -176,15 +166,13 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -200,68 +188,58 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { cublasOperation_t cuTransA = (trans_a == false) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasSgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasDgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { - PADDLE_ENFORCE(platform::dynload::cublasSaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { - PADDLE_ENFORCE(platform::dynload::cublasDaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -277,8 +255,9 @@ struct TensorSetConstantGPU { template void operator()() const { - SetConstant functor; - functor(context_, tensor_, static_cast(value_)); + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); } const platform::DeviceContext& context_; @@ -294,27 +273,27 @@ void set_constant_with_place( TensorSetConstantGPU(context, tensor, value)); } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -// template struct ColwiseSum; -// The ColwiseSum failed in debug mode, +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, // and only failed for this case. So reimplemented it. 
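// Note on the reimplementation that follows: instead of the Eigen reduction,
// the specialization fills a length-M vector `one` with 1.0 via SetConstant
// and then calls gemv with trans == true, which computes
//
//   vector = input^T * one,   i.e.   vector[j] = sum_i input[i][j]
//
// -- exactly the column-wise sum of the M x N input. The same identity in a
// plain CPU loop, for reference only (this is not the CUDA code itself):
//
//   for (int j = 0; j < N; ++j) {
//     double col_sum = 0.0;
//     for (int i = 0; i < M; ++i) col_sum += X[i * N + j];  // times one[i] == 1
//     out[j] = col_sum;
//   }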
template <> -void ColwiseSum::operator()( - const platform::DeviceContext& context, const framework::Tensor& input, +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - gemv(context, true, static_cast(in_dims[0]), - static_cast(in_dims[1]), 1.0, - input.data(), one.data(), - 0.0, vector->data()); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); } } // namespace math diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index 5a42854f22..8cc03c2ba0 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #include #include #include @@ -62,53 +62,51 @@ namespace math { // Then matrixA: M * K, matrixB: K * N, matrixC : M * N // For more detailed info, please refer to // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html -template -void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const T alpha, const T* A, const T* B, const T beta, T* C); // gemm wrapper with stride args for matrix uncontinuous in memory -template -void gemm(const platform::DeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const T alpha, const T* A, const int lda, const T* B, const int ldb, - const T beta, T* C, const int ldc); +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); // matrix multiply with continuous memory -template -void matmul(const platform::DeviceContext& context, - const framework::Tensor& matrix_a, bool trans_a, - const framework::Tensor& matrix_b, bool trans_b, T alpha, - framework::Tensor* matrix_out, T beta); +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); // Batched gemm -template -void batched_gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const T alpha, - const T* A, const T* B, const T beta, T* C, - const int batchCount, const int strideA, const int strideB); - -template -void gemv(const platform::DeviceContext& context, const bool trans_a, - const int M, const int N, const T alpha, const T* A, const T* B, - const T beta, T* C); - -template -void axpy(const platform::DeviceContext& context, const int n, const T alpha, - const T* x, T* y); - -template +template +void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* 
B, + const T beta, T* C, const int batchCount, const int strideA, + const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template struct Transpose { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& in, framework::Tensor* out, - const std::vector& axis); + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); }; -template +template struct SetConstant { - void operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num); + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); }; template @@ -118,17 +116,16 @@ void set_constant_with_place(const platform::DeviceContext& context, void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value); -template +template struct RowwiseAdd { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, const framework::Tensor& vec, - framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); }; -template +template struct ColwiseSum { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); }; } // namespace math diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index 4dc17a4e52..3e6d833865 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -20,16 +20,17 @@ namespace paddle { namespace operators { namespace math { -template -void SetConstant::operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num) { +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { auto t = framework::EigenVector::Flatten(*tensor); - t.device(*context.GetEigenDevice()) = t.constant(static_cast(num)); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); } -template -void Transpose::operator()( - const platform::DeviceContext& context, const framework::Tensor& in, +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { @@ -40,15 +41,15 @@ void Transpose::operator()( auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); - auto* dev = context.GetEigenDevice(); + auto* dev = context.eigen_device(); eigen_out.device(*dev) = eigen_in.shuffle(permute); } -template -void RowwiseAdd::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { +template +void RowwiseAdd::operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, + framework::Tensor* output) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector.numel(), size); @@ -59,14 +60,14 @@ void RowwiseAdd::operator()(const 
platform::DeviceContext& context, auto out = framework::EigenMatrix::From(*output); Eigen::array shape({{1, static_cast(size)}}); Eigen::array bcast({{static_cast(in_dims[0]), 1}}); - out.device(*context.GetEigenDevice()) = + out.device(*context.eigen_device()) = in + vec.reshape(shape).broadcast(bcast); } -template -void ColwiseSum::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* vector) { +template +void ColwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); @@ -74,7 +75,7 @@ void ColwiseSum::operator()(const platform::DeviceContext& context, auto vec = framework::EigenMatrix::From(*vector); auto in = framework::EigenMatrix::From(input); Eigen::array shape({{1, static_cast(size)}}); - vec.reshape(shape).device(*context.GetEigenDevice()) = + vec.reshape(shape).device(*context.eigen_device()) = in.sum(Eigen::array({{0}})).reshape(shape); } diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc index 983c9fdcff..7c6f098ca9 100644 --- a/paddle/operators/math/math_function_test.cc +++ b/paddle/operators/math/math_function_test.cc @@ -21,7 +21,7 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, input3_ptr + 1, 4); @@ -55,7 +55,7 @@ TEST(math_function, gemm_trans_clbas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); @@ -74,7 +74,8 @@ TEST(math_function, zero) { auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::SetConstant + paddle::operators::math::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); @@ -110,7 +111,7 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., data_a, data_b, 0., data_c); diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index d5d6f0c73b..32e96d9487 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -21,7 +21,7 @@ TEST(math_function, notrans_mul_trans) { out_gpu.mutable_data({2, 2}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -55,7 +55,7 @@ TEST(math_function, trans_mul_notrans) { out_gpu.mutable_data({3, 3}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -106,7 +106,7 @@ TEST(math_function, gemm_notrans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - 
paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -161,7 +161,7 @@ TEST(math_function, gemm_trans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -208,7 +208,7 @@ void GemvTest(int m, int n, bool trans) { paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h index 6ba9a0ba9a..7048e11e6f 100644 --- a/paddle/operators/math/matmul.h +++ b/paddle/operators/math/matmul.h @@ -26,13 +26,12 @@ namespace math { // // Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported // yet. -template +template class MatMulFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, T alpha, - framework::Tensor* out, T beta) { + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { auto dim_a = a.dims(); auto dim_b = b.dims(); @@ -108,13 +107,13 @@ class MatMulFunctor { if (!batchCount) { // regular matrix multiplication - gemm(context, transA, transB, M, N, kA, alpha, a.data(), - b.data(), beta, out->data()); + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); } else { // batched matrix multiplication - batched_gemm(context, transA, transB, M, N, kA, alpha, - a.data(), b.data(), beta, out->data(), - batchCount, strideA, strideB); + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); } } }; diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc index c9003962d3..fea86675f7 100644 --- a/paddle/operators/math/maxouting.cc +++ b/paddle/operators/math/maxouting.cc @@ -20,9 +20,9 @@ namespace math { // All tensors are in NCHW format, and the groups must be greater than 1 template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -54,9 +54,9 @@ class MaxOutFunctor { }; template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -91,10 +91,10 @@ class MaxOutGradFunctor { } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class 
MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu index c3fabcae08..6056ad251c 100644 --- a/paddle/operators/math/maxouting.cu +++ b/paddle/operators/math/maxouting.cu @@ -78,9 +78,9 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, * All tensors are in NCHW format. */ template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -98,20 +98,18 @@ class MaxOutFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxOut< - T><<(context) - .stream()>>>(nthreads, input_data, input_channels, - input_height, input_width, groups, output_data); + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); } }; /* * All tensors are in NCHW format. */ template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -132,20 +130,17 @@ class MaxOutGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxoutGrad< - T><<(context) - .stream()>>>(nthreads, input_data, output_data, - output_grad_data, input_grad_data, input_channels, - input_height, input_width, groups); + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups); } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h index 2d9069b0b3..68f4743db0 100644 --- a/paddle/operators/math/maxouting.h +++ b/paddle/operators/math/maxouting.h @@ -23,20 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template - +template class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - int groups); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); }; -template +template class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups); }; diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc index 135984586a..150de6fd59 100644 --- a/paddle/operators/math/pooling.cc +++ b/paddle/operators/math/pooling.cc @@ -24,9 +24,9 @@ namespace math { * height and width, respectively. 
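*
* Note (summary in rough pseudocode, not the literal implementation): the
* Pool2dFunctor / Pool3dFunctor defined in this file take a PoolProcess policy
* object (MaxPool / AvgPool in the explicit instantiations below), so max and
* average pooling share one window traversal and differ only in how each
* element is accumulated:
*
*   ele = pool_process.initial();
*   for each (h, w) in the pooling window:
*     pool_process.compute(input[h][w], &ele);
*   pool_process.finalize(pool_size, &ele);   // e.g. divide by window size
*   output[ph][pw] = ele;
*
* Max pooling still needs the dedicated MaxPool2dGradFunctor /
* MaxPool3dGradFunctor defined later, because its gradient flows only to the
* arg-max position rather than being spread over the whole window.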
*/ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -84,9 +84,9 @@ class Pool2dFunctor { * and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -152,9 +152,9 @@ class Pool2dGradFunctor { * height and width, respectively. */ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -213,25 +213,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; /* * All tensors are in NCDHW format. @@ -239,9 +243,9 @@ template class Pool2dGradFunctor< * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -314,9 +318,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -398,9 +402,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. 
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -473,25 +477,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; /* * All tensors are in NCHW format. @@ -499,9 +507,9 @@ template class Pool3dGradFunctor< * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -564,9 +572,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -602,10 +610,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. @@ -613,9 +625,9 @@ template class MaxPool2dWithIndexGradFunctor; * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -692,9 +704,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -735,10 +747,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu index ca3560f264..0243cf8316 100644 --- a/paddle/operators/math/pooling.cu +++ b/paddle/operators/math/pooling.cu @@ -155,9 +155,9 @@ __global__ void KernelMaxPool2DGrad( * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -183,11 +183,7 @@ class Pool2dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2D<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, output_data); @@ -200,9 +196,9 @@ class Pool2dFunctor { * height and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -231,11 +227,7 @@ class Pool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -249,9 +241,9 @@ class Pool2dGradFunctor { * height and width, respectively. 
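*
* Note: the CUDA-side benefit of taking platform::CUDADeviceContext directly
* shows up in the kernel launches in this file. Previously the generic context
* had to be cast back to the typed one to obtain the stream, roughly
*
*   KernelPool2D<PoolProcess, T><<<grid, threads, 0,
*       reinterpret_cast<const platform::CUDADeviceContext&>(context)
*           .stream()>>>(...);
*
* whereas the updated code launches on the typed context's stream directly:
*
*   KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(...);
*
* The same simplification is applied to every kernel launch in this file and
* to the cuBLAS calls in math_function.cu, which now use
* context.cublas_handle() in place of the reinterpret_cast.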
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -281,10 +273,7 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -292,25 +281,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; template __global__ void KernelPool3D(const int nthreads, const T* input_data, @@ -478,9 +471,9 @@ __global__ void KernelMaxPool3DGrad( * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -512,11 +505,7 @@ class Pool3dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3D<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -531,9 +520,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -569,11 +558,7 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -588,9 +573,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. 
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -626,10 +611,7 @@ class MaxPool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -638,25 +620,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; template __global__ void KernelMaxPool2dWithIdx( @@ -747,9 +733,9 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -776,10 +762,7 @@ class MaxPool2dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2dWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, output_data, mask_data); @@ -792,9 +775,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. 
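*
* Note: the *WithIndex functors carry two element types -- T1 for the pooled
* values and T2 for the mask (the explicit instantiations pair float/int and
* double/int). The forward kernels record, for every output element, the
* offset of the maximum inside its input feature-map plane in `mask`, so the
* gradient functor below can route output_grad with a lookup instead of
* recomputing the max; conceptually (not the kernel's literal loop structure):
*
*   input_grad[plane_offset + mask[out_idx]] += output_grad[out_idx];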
*/ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -821,10 +804,7 @@ class MaxPool2dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -832,10 +812,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx( @@ -950,9 +934,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -985,10 +969,7 @@ class MaxPool3dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdx<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -1002,9 +983,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -1037,10 +1018,7 @@ class MaxPool3dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, @@ -1049,10 +1027,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h index 19fbd8b4bb..2759f06cb6 100644 --- a/paddle/operators/math/pooling.h +++ b/paddle/operators/math/pooling.h @@ -84,62 +84,58 @@ class AvgPoolGrad { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ -template +template class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& 
context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -153,38 +149,38 @@ class MaxPool3dGradFunctor { * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in * NCDHW format. */ -template +template class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 514f2adef2..ab758d1e7f 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -19,8 +19,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -67,12 +67,12 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -88,7 +88,7 @@ struct SelectedRowsAddTensor { 
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -103,17 +103,16 @@ struct SelectedRowsAddTensor { auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -143,14 +142,14 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -175,10 +174,10 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index c1dd323ba2..c44577e00a 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -64,16 +64,15 @@ struct SelectedRowsAdd { reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); - memory::Copy( - boost::get(out_place), out_data + in1_value.numel(), - boost::get(in2_place), in2_data, - in2_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -96,8 +95,8 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + 
void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -117,30 +116,28 @@ struct SelectedRowsAddTensor { auto* in2_data = input2.data(); auto* out_data = output->data(); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), out_data, in1_row_numel); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -163,18 +160,17 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy( - boost::get(in2_place), in2_data + input2_offset, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -197,8 +193,8 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -216,17 +212,16 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddToTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), in2_data, in1_row_numel); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), in2_data, in1_row_numel); } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index 
d6dc6c03c9..1149075abf 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -21,33 +21,33 @@ namespace math { // SelectedRows + SelectedRows will simplely concat value and rows. // The real computation happens in dealing with LoDTensor. -template +template struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output); }; -template +template struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2); }; diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc index a3649b6875..8c74cab0a1 100644 --- a/paddle/operators/math/selected_rows_functor_test.cc +++ b/paddle/operators/math/selected_rows_functor_test.cc @@ -23,7 +23,7 @@ TEST(selected_rows_functor, cpu_add) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -47,7 +47,7 @@ TEST(selected_rows_functor, cpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -85,7 +85,7 @@ TEST(selected_rows_functor, cpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); auto* tensor2_data = tensor2->data(); @@ -112,7 +112,7 @@ TEST(selected_rows_functor, cpu_add_to) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -137,7 +137,7 @@ TEST(selected_rows_functor, cpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -173,7 +173,7 @@ TEST(selected_rows_functor, cpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 
7de9291c17..777caf5635 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -24,7 +24,7 @@ TEST(selected_rows_functor, gpu_add) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -48,7 +48,7 @@ TEST(selected_rows_functor, gpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -90,7 +90,7 @@ TEST(selected_rows_functor, gpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; @@ -122,7 +122,7 @@ TEST(selected_rows_functor, gpu_add_to) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -147,7 +147,7 @@ TEST(selected_rows_functor, gpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -187,7 +187,7 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc index 5b3bde02fb..88977be1f8 100644 --- a/paddle/operators/math/sequence2batch.cc +++ b/paddle/operators/math/sequence2batch.cc @@ -19,9 +19,9 @@ namespace operators { namespace math { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -48,13 +48,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index c5d968aeb2..452ae89510 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const 
platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -59,20 +59,19 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); CopyMatrixRowsKernel<<>>( src_data, dst_data, index, height, width, is_src_index); } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h index 73295ddbcb..a5c43a2c7d 100644 --- a/paddle/operators/math/sequence2batch.h +++ b/paddle/operators/math/sequence2batch.h @@ -26,7 +26,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CopyMatrixRowsFunctor { public: // If is_src_index is true, @@ -34,12 +34,12 @@ class CopyMatrixRowsFunctor { // If is_src_index is false, // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. - void operator()(const platform::DeviceContext& context, - const framework::Tensor& src, const size_t* index, - framework::Tensor& dst, bool is_src_index); + void operator()(const DeviceContext& context, const framework::Tensor& src, + const size_t* index, framework::Tensor& dst, + bool is_src_index); }; -template +template class LoDTensor2BatchFunctor { // Calculate the length of each sequence and // sort sequence index by the length. 
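The two directions of CopyMatrixRowsFunctor described in the comment above can be illustrated with a small CPU-only sketch; it uses plain std::vector in place of framework::Tensor, and the helper name CopyRows and its parameters are illustrative only, not part of this patch.

#include <algorithm>
#include <cstddef>
#include <vector>

// Gather  (is_src_index == true):  dst row i        <- src row index[i]
// Scatter (is_src_index == false): dst row index[i] <- src row i
void CopyRows(const std::vector<float>& src, const std::vector<size_t>& index,
              std::vector<float>* dst, size_t width, bool is_src_index) {
  for (size_t i = 0; i < index.size(); ++i) {
    const float* from = &src[(is_src_index ? index[i] : i) * width];
    float* to = &(*dst)[(is_src_index ? i : index[i]) * width];
    std::copy(from, from + width, to);
  }
}

This mirrors how LoDTensor2BatchFunctor builds seq2batch_idx and calls the functor with is_src_index = true, while Batch2LoDTensorFunctor reuses the same index with is_src_index = false to write the rows back into sequence order.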
@@ -56,7 +56,7 @@ class LoDTensor2BatchFunctor { }; public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& lod_tensor, framework::LoDTensor& batch, bool is_cal_batch_lod, bool is_reverse = false) const { @@ -65,7 +65,7 @@ class LoDTensor2BatchFunctor { PADDLE_ENFORCE_GT(lods.size(), 2UL); PADDLE_ENFORCE_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1].data(), batch, true); return; } @@ -143,22 +143,22 @@ class LoDTensor2BatchFunctor { } batch.set_lod(batch_lods); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, seq2batch_idx, batch, true); } }; -template +template class Batch2LoDTensorFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& batch, framework::LoDTensor& lod_tensor) const { auto in_lod = batch.lod(); PADDLE_ENFORCE_GT(in_lod.size(), 2UL); PADDLE_ENFORCE_EQ(in_lod[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_seq; + CopyMatrixRowsFunctor to_seq; size_t* index = in_lod[1].data(); to_seq(context, batch, index, lod_tensor, false); } diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc index 5913c99fdb..8fb92b1a13 100644 --- a/paddle/operators/math/sequence_pooling.cc +++ b/paddle/operators/math/sequence_pooling.cc @@ -20,9 +20,9 @@ namespace operators { namespace math { template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -60,9 +60,9 @@ class MaxSeqPoolFunctor { }; template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -80,7 +80,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -93,10 +93,10 @@ class MaxSeqPoolGradFunctor { } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 5ed951402f..4c9e6b375c 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -46,9 +46,9 @@ __global__ void KeMaxSequencePool(const T* input, const size_t* starts, } template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { 
auto in_dims = input.dims(); @@ -71,8 +71,7 @@ class MaxSeqPoolFunctor { dim3 threads(256, 1); dim3 grid(num_seq, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePool<<>>( in_data, starts.data(), out_data, max_index, num_seq, dim); } @@ -91,9 +90,9 @@ __global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, } template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -111,7 +110,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -119,17 +118,16 @@ class MaxSeqPoolGradFunctor { unsigned int blocks = (num_seq * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePoolGrad<<>>( og_data, max_index, ig_data, num_seq, dim); } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h index 35dfe26de1..13ffb2ebef 100644 --- a/paddle/operators/math/sequence_pooling.h +++ b/paddle/operators/math/sequence_pooling.h @@ -23,18 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template +template class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index); }; -template +template class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad); diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc index 3e2f15d6c2..72f10f35f4 100644 --- a/paddle/operators/math/softmax.cc +++ b/paddle/operators/math/softmax.cc @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu index 4dbab51d46..9e73f6a371 100644 --- a/paddle/operators/math/softmax.cu +++ b/paddle/operators/math/softmax.cu @@ -21,10 +21,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class 
SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h index fe10746502..471f44d340 100644 --- a/paddle/operators/math/softmax.h +++ b/paddle/operators/math/softmax.h @@ -19,19 +19,18 @@ namespace paddle { namespace operators { namespace math { -template +template class SoftmaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* X, framework::Tensor* Y); + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); }; -template +template class SoftmaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* y, const framework::Tensor* y_grad, - framework::Tensor* x_grad); + void operator()(const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad); }; } // namespace math diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h index 05793eeb3e..82f597ff79 100644 --- a/paddle/operators/math/softmax_impl.h +++ b/paddle/operators/math/softmax_impl.h @@ -32,10 +32,10 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y) { +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); @@ -56,19 +56,18 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)) .unaryExpr(ValueClip()); - softmax.device(*context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); } -template -void SoftmaxGradFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* y, +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); @@ -89,8 +88,7 @@ void SoftmaxGradFunctor::operator()( .eval() .reshape(batch_by_one) .broadcast(one_by_class); - logits_grad.device(*context.GetEigenDevice()) = - (softmax_grad - dot) * softmax; + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; } } // namespace math diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index b57d3dc141..ecd3a647e0 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -17,9 +17,9 @@ namespace paddle { namespace operators { namespace math { template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -48,9 +48,9 @@ class Unpool2dMaxFunctor { } }; template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void 
operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -82,10 +82,10 @@ class Unpool2dMaxGradFunctor { } } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 37c3c8b689..ecbde0f6a7 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -67,9 +67,9 @@ __global__ void KernelUnpool2dMaxGrad( * All tensors are in NCHW format. */ template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -83,21 +83,18 @@ class Unpool2dMaxFunctor { T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); } }; /* * All tensors are in NCHW format. */ template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -116,19 +113,16 @@ class Unpool2dMaxGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_grad_data, output_height, - output_width, input_grad_data); + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 7077d7c227..0f0ff1371e 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -18,18 +18,16 @@ limitations under the License. 
*/ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output); }; -template +template class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc index 99eb7fd46d..d574ed9234 100644 --- a/paddle/operators/math/vol2col.cc +++ b/paddle/operators/math/vol2col.cc @@ -25,9 +25,9 @@ namespace math { * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -111,9 +111,9 @@ class Vol2ColFunctor { * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, @@ -190,10 +190,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu index dae3be858e..b029442fe4 100644 --- a/paddle/operators/math/vol2col.cu +++ b/paddle/operators/math/vol2col.cu @@ -68,9 +68,9 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth, * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -117,9 +117,7 @@ class Vol2ColFunctor { const int threads = 1024; const int blocks = (num_outputs + 1024 - 1) / 1024; - vol2col<<(context) - .stream()>>>( + vol2col<<>>( num_outputs, vol.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -196,9 +194,9 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth, * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, @@ -245,9 +243,7 @@ class Col2VolFunctor { const int threads = 1024; const int blocks = (num_kernels + 1024 - 1) / 1024; - col2vol<<(context) - .stream()>>>( + col2vol<<>>( num_kernels, col.data(), input_depth, 
input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -256,10 +252,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h index dc64d1d977..dcd80370e8 100644 --- a/paddle/operators/math/vol2col.h +++ b/paddle/operators/math/vol2col.h @@ -63,22 +63,20 @@ namespace math { * \note The caller needs to ensure that volShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& vol, + void operator()(const DeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, framework::Tensor* col) const; }; -template +template class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 62c3152304..f46db3c567 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -template +template void testVol2col() { paddle::framework::Tensor input; paddle::framework::Tensor input_tmp; @@ -24,18 +24,7 @@ void testVol2col() { paddle::framework::Tensor output_tmp; auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); /** * input = [[0, 1, 2, @@ -88,7 +77,7 @@ void testVol2col() { output_depth, output_height, output_width}, *place); - paddle::operators::math::Vol2ColFunctor vol2col; + paddle::operators::math::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -113,7 +102,7 @@ void testVol2col() { CopyFrom(input_tmp, *place, *context, &input); } - paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -130,8 +119,9 @@ void testVol2col() { } TEST(math, vol2col) { - testVol2col(); + testVol2col(); #ifdef PADDLE_WITH_CUDA - testVol2col(); + testVol2col(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc index 5a1a615420..ee0bc0c370 100644 --- a/paddle/operators/matmul_op.cc +++ b/paddle/operators/matmul_op.cc @@ -206,7 +206,8 @@ class MatMulOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(matmul, 
ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, ops::MatMulOpGrad); -REGISTER_OP_CPU_KERNEL(matmul, - ops::MatMulKernel); REGISTER_OP_CPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); + matmul, ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc index b7e66382f0..6a3772c004 100644 --- a/paddle/operators/matmul_op.cu.cc +++ b/paddle/operators/matmul_op.cu.cc @@ -15,7 +15,8 @@ #include "paddle/operators/matmul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(matmul, - ops::MatMulKernel); -REGISTER_OP_GPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h index 1e4aa48b70..de9da487b3 100644 --- a/paddle/operators/matmul_op.h +++ b/paddle/operators/matmul_op.h @@ -27,7 +27,7 @@ using DDim = framework::DDim; using framework::make_ddim; using framework::vectorize; -template +template class MatMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,8 +38,9 @@ class MatMulKernel : public framework::OpKernel { bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); - math::MatMulFunctor()(context.device_context(), x, transpose_x, y, - transpose_y, T(1), out, T(0)); + math::MatMulFunctor()( + context.template device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); } }; @@ -68,17 +69,16 @@ Tensor CombineBatchAndM(const Tensor& input) { // Reshape a rank-3 tensor from P x M x N to M x (P * N). // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. -template -Tensor CombineBatchAndN(const framework::ExecutionContext& context, - const Tensor& input) { +template +Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) { Tensor output; auto in_dims = input.dims(); if (in_dims.size() == 3) { output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); std::vector axis = {1, 0, 2}; - math::Transpose trans; - trans(context.device_context(), input, &output, axis); + math::Transpose trans; + trans(context, input, &output, axis); std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); } else { @@ -112,7 +112,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, // // To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N // to X: (P * M) x K, dOut: (P * M) x N. -template +template class MatMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -178,24 +178,23 @@ class MatMulGradKernel : public framework::OpKernel { Tensor Y = Reshape(y, make_ddim(y_dims)); Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + auto& dev_ctx = context.template device_context(); if (dx) { dx->mutable_data(context.GetPlace()); const Tensor& dOut_for_dX = (x_dims.size() == 2 && y_dims.size() == 3) - ? CombineBatchAndN(context, dOut) + ? CombineBatchAndN(dev_ctx, dOut) : dOut; if (x_dims.size() == 2 && y_dims.size() == 3) { Y = transpose_y ? 
CombineBatchAndM(Y) - : CombineBatchAndN(context, Y); + : CombineBatchAndN(dev_ctx, Y); } if (transpose_x) { - math::MatMulFunctor()(context.device_context(), Y, - transpose_y, dOut_for_dX, transpose_x, - T(1), dx, T(0)); + math::MatMulFunctor()( + dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0)); } else { - math::MatMulFunctor()(context.device_context(), dOut_for_dX, - transpose_x, Y, !transpose_y, T(1), dx, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0)); } } @@ -205,18 +204,16 @@ class MatMulGradKernel : public framework::OpKernel { ? CombineBatchAndM(dOut) : dOut; if (y_dims.size() == 2 && x_dims.size() == 3) { - X = transpose_x ? CombineBatchAndN(context, X) + X = transpose_x ? CombineBatchAndN(dev_ctx, X) : CombineBatchAndM(X); dOut = CombineBatchAndM(dOut); } if (transpose_y) { - math::MatMulFunctor()(context.device_context(), dOut_for_dY, - transpose_y, X, transpose_x, T(1), dy, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0)); } else { - math::MatMulFunctor()(context.device_context(), X, - !transpose_x, dOut_for_dY, transpose_y, - T(1), dy, T(0)); + math::MatMulFunctor()( + dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0)); } } } diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc index e203a25d54..011616e615 100644 --- a/paddle/operators/maxout_op.cc +++ b/paddle/operators/maxout_op.cc @@ -40,23 +40,28 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { "the number of channels divided by groups.." )DOC"); AddComment(R"DOC( - Assumed the input shape is (N, Ci, H, W). - The output shape is (N, Co, H, W). Then `Co = Ci / groups`. +MaxOut Operator. - math: - y_{si+j} = \max_k x_{gsi + sk + j} - g = groups - s = input.size / num_channels - 0 \le i < num_channels / groups - 0 \le j < s - 0 \le k < groups +Assumed the input shape is (N, Ci, H, W). +The output shape is (N, Co, H, W). 
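Before the formal definition below, a minimal standalone sketch may help. It is illustrative only (plain C++ over a hand-laid-out buffer, not part of this patch), assuming one sample with Ci = 4 channels, groups = 2, and s = H*W = 3 spatial positions per channel.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int groups = 2, channels = 4, s = 3;    // Ci = 4, H*W = 3
  const int out_channels = channels / groups;   // Co = Ci / groups = 2
  // Layout follows y_{si+j} = max_k x_{gsi + sk + j}:
  // the first 2*s values feed output channel 0, the next 2*s feed channel 1.
  std::vector<float> x = {1, 5, 2,  4, 0, 3,
                          7, 1, 1,  2, 9, 6};
  std::vector<float> y(out_channels * s);
  for (int i = 0; i < out_channels; ++i)
    for (int j = 0; j < s; ++j) {
      float m = x[groups * s * i + j];          // k = 0
      for (int k = 1; k < groups; ++k)
        m = std::max(m, x[groups * s * i + s * k + j]);
      y[s * i + j] = m;
    }
  for (float v : y) std::printf("%.0f ", v);    // prints: 4 5 3 7 9 6
  return 0;
}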
+Then $Co = Ci / groups$ and the operator formula is as follows: - Please refer to Paper: - - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf - - Multi-digit Number Recognition from Street View \ - Imagery using Deep Convolutional Neural Networks: \ - https://arxiv.org/pdf/1312.6082v4.pdf - )DOC"); +$$ +y_{si+j} = \max_k x_{gsi + sk + j} \\ +g = groups \\ +s = \frac{input.size}{num\_channels} \\ +0 \le i < \frac{num\_channels}{groups} \\ +0 \le j < s \\ +0 \le k < groups +$$ + +Please refer to Paper: + - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf + - Multi-digit Number Recognition from Street View \ + Imagery using Deep Convolutional Neural Networks: \ + https://arxiv.org/pdf/1312.6082v4.pdf + +)DOC"); } }; @@ -96,7 +101,8 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL(maxout, - ops::MaxOutKernel); REGISTER_OP_CPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel); + maxout, ops::MaxOutKernel); +REGISTER_OP_CPU_KERNEL( + maxout_grad, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc index decd43913d..2904f0ff96 100644 --- a/paddle/operators/maxout_op.cu.cc +++ b/paddle/operators/maxout_op.cu.cc @@ -15,9 +15,10 @@ #include "paddle/operators/maxout_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(maxout, - ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_GPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel, - ops::MaxOutGradKernel); +REGISTER_OP_CUDA_KERNEL( + maxout, ops::MaxOutKernel, + ops::MaxOutKernel); +REGISTER_OP_CUDA_KERNEL( + maxout_grad, + ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h index 44a0d073dd..e8b12552b9 100644 --- a/paddle/operators/maxout_op.h +++ b/paddle/operators/maxout_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxOutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,12 +31,13 @@ class MaxOutKernel : public framework::OpKernel { Tensor* out = context.Output("Out"); int groups = context.template Attr("groups"); - math::MaxOutFunctor maxout_forward; - maxout_forward(context.device_context(), *in_x, out, groups); + math::MaxOutFunctor maxout_forward; + maxout_forward(context.template device_context(), *in_x, out, + groups); } }; -template +template class MaxOutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,14 +47,13 @@ class MaxOutGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); Tensor* in_x_grad = context.Output(framework::GradVarName("X")); int groups = context.template Attr("groups"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(context.device_context(), *in_x, in_x_grad, *out, - *out_grad, groups); + math::MaxOutGradFunctor maxout_backward; + maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups); } } }; diff --git a/paddle/operators/mean_op.cc 
b/paddle/operators/mean_op.cc index dcc5b4286f..8932d700c2 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -76,8 +76,9 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker); REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); -REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_CPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CPU_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CPU_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu index ca089938c0..93062bf540 100644 --- a/paddle/operators/mean_op.cu +++ b/paddle/operators/mean_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/mean_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_GPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CUDA_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CUDA_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h index c99286a5b9..351b345959 100644 --- a/paddle/operators/mean_op.h +++ b/paddle/operators/mean_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class MeanKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,13 +38,14 @@ class MeanKernel : public framework::OpKernel { auto X = EigenVector::Flatten(*input); auto y = EigenScalar::From(*output); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = X.mean(); } }; -template +template class MeanGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +57,8 @@ class MeanGradKernel : public framework::OpKernel { T ig_size = static_cast(IG->numel()); Eigen::DSizes bcast(ig_size); - EigenVector::Flatten(*IG).device(context.GetEigenDevice()) = + EigenVector::Flatten(*IG).device( + *context.template device_context().eigen_device()) = (EigenVector::From(*OG) / ig_size).broadcast(bcast); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index 4684c20208..27f0c8de20 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -102,5 +102,5 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { namespace ops = paddle::operators; REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker); -REGISTER_OP_CPU_KERNEL(minus, - ops::MinusKernel); +REGISTER_OP_CPU_KERNEL( + minus, ops::MinusKernel); diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu index a8375cc630..3b202ea92e 100644 --- a/paddle/operators/minus_op.cu +++ b/paddle/operators/minus_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/minus_op.h" -REGISTER_OP_GPU_KERNEL( - minus, paddle::operators::MinusKernel); +REGISTER_OP_CUDA_KERNEL( + minus, + paddle::operators::MinusKernel); diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h index bd9a2790aa..78e1e1be6d 100644 --- a/paddle/operators/minus_op.h +++ b/paddle/operators/minus_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class 
MinusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -28,7 +28,8 @@ class MinusKernel : public framework::OpKernel { auto* out_tensor = context.Output("Out"); out_tensor->mutable_data(context.GetPlace()); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); framework::EigenVector::Flatten(*out_tensor).device(dev) = framework::EigenVector::Flatten(*left_tensor) - framework::EigenVector::Flatten(*right_tensor); diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc index 28528848af..f0a42491bf 100644 --- a/paddle/operators/modified_huber_loss_op.cc +++ b/paddle/operators/modified_huber_loss_op.cc @@ -115,6 +115,6 @@ REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, REGISTER_OP_CPU_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); + ops::ModifiedHuberLossKernel); REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu index 8854e166cd..40a8447da4 100644 --- a/paddle/operators/modified_huber_loss_op.cu +++ b/paddle/operators/modified_huber_loss_op.cu @@ -71,8 +71,8 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); -REGISTER_OP_GPU_KERNEL(modified_huber_loss_grad, - ops::ModifiedHuberLossGradGPUKernel); + ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h index aba75efad9..157ae0682e 100644 --- a/paddle/operators/modified_huber_loss_op.h +++ b/paddle/operators/modified_huber_loss_op.h @@ -46,7 +46,7 @@ struct ModifiedHuberLossForward { } }; -template +template class ModifiedHuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc index 1995400619..2ab48fedec 100644 --- a/paddle/operators/momentum_op.cc +++ b/paddle/operators/momentum_op.cc @@ -71,8 +71,12 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor, default Tensor) " "Input learning rate"); - AddOutput("ParamOut", "(Tensor) Output updated parameter"); - AddOutput("VelocityOut", "(Tensor) Output updated velocity"); + AddOutput("ParamOut", + "(Tensor) This output is updated parameter. " + "It shared memory with Input(Param)."); + AddOutput("VelocityOut", + "(Tensor) This output is updated velocity. 
" + "It shared memory with Input(Velocity)."); AddAttr("mu", "(float) Momentum coefficient"); AddAttr("use_nesterov", @@ -101,5 +105,5 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); -REGISTER_OP_CPU_KERNEL( - momentum, ops::MomentumOpKernel); +REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu index efc24e795e..00f1253465 100644 --- a/paddle/operators/momentum_op.cu +++ b/paddle/operators/momentum_op.cu @@ -12,9 +12,67 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU -#include "paddle/operators/momentum_op.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +__global__ void MomentumKernel(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, + const int64_t num, bool use_nesterov, T* p_out, + T* v_out) { + T lr = learning_rate[0]; + if (use_nesterov) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T g_val = g[i]; + T v_new = v[i] * mu + g_val; + v_out[i] = v_new; + p_out[i] = p[i] - (g_val - v_new * mu) * lr; + } + } else { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T v_new = v[i] * mu + g[i]; + v_out[i] = v_new; + p_out[i] = p[i] - lr * v_new; + } + } +} + +template +class MomentumOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + T* p_out = param_out->mutable_data(ctx.GetPlace()); + T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto* p = param->data(); + auto* v = velocity->data(); + auto* g = grad->data(); + auto* lr = learning_rate->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + MomentumKernel<<>>( + p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out); + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - momentum, ops::MomentumOpKernel); +REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h index 8f7f5eb5c2..da69532ea5 100644 --- a/paddle/operators/momentum_op.h +++ b/paddle/operators/momentum_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class MomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -33,7 +33,7 @@ class MomentumOpKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()); velocity_out->mutable_data(ctx.GetPlace()); - float mu = ctx.Attr("mu"); + T mu = static_cast(ctx.Attr("mu")); bool use_nesterov = ctx.Attr("use_nesterov"); auto p_out = framework::EigenVector::Flatten(*param_out); @@ -42,18 +42,13 @@ class MomentumOpKernel : public framework::OpKernel { auto p = framework::EigenVector::Flatten(*param); auto v = framework::EigenVector::Flatten(*velocity); auto g = framework::EigenVector::Flatten(*grad); - auto lr = framework::EigenVector::Flatten(*learning_rate); + auto* lr = learning_rate->data(); - auto place = ctx.GetEigenDevice(); - - Eigen::DSizes grad_dsize(grad->numel()); - - v_out.device(place) = v * mu + g; + v_out = v * mu + g; if (use_nesterov) { - p_out.device(place) = p - g * lr.broadcast(grad_dsize) + - v_out * mu * lr.broadcast(grad_dsize); + p_out = p - (g - v_out * mu) * lr[0]; } else { - p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out; + p_out = p - lr[0] * v_out; } } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 3c39ae10dc..bc4a5fdf0b 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -149,6 +149,7 @@ REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, ops::MulOpShapeInference, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc index 66dc3d6d10..6095de58d0 100644 --- a/paddle/operators/mul_op.cu.cc +++ b/paddle/operators/mul_op.cu.cc @@ -15,6 +15,7 @@ #include "paddle/operators/mul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 0eb9df41e9..1b467dca83 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,15 +46,16 @@ class MulKernel : public framework::OpKernel { if (z_dim.size() != 2) { z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); } - math::matmul(context.device_context(), x_matrix, false, y_matrix, - false, 1, z, 0); + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); if (z_dim.size() != 2) { z->Resize(z_dim); } } }; -template +template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -77,6 +78,7 @@ class MulGradKernel : public framework::OpKernel { Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); if (dx) { dx->mutable_data(ctx.GetPlace()); Tensor dx_matrix = 
dx->dims().size() > 2 @@ -84,8 +86,8 @@ class MulGradKernel : public framework::OpKernel { : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - math::matmul(ctx.device_context(), dout_mat, false, y_matrix, - true, 1, &dx_matrix, 0); + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); @@ -93,8 +95,8 @@ class MulGradKernel : public framework::OpKernel { ? framework::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; // dy = x' * dout. dy K x N, dout : M x N, x : M x K - math::matmul(ctx.device_context(), x_matrix, true, dout_mat, - false, 1, &dy_matrix, 0); + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); } } }; diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 8e7f544e0d..b1ee8051c4 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -119,7 +119,8 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( - multiplex, ops::MultiplexCPUKernel); + multiplex, + ops::MultiplexCPUKernel); REGISTER_OP_CPU_KERNEL( multiplex_grad, - ops::MultiplexGradCPUKernel); + ops::MultiplexGradCPUKernel); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 10dff8d021..47986e9ff8 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel { CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -60,7 +60,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } @@ -72,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { @@ -87,8 +88,9 @@ class MultiplexGradGPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - multiplex, ops::MultiplexGPUKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( multiplex_grad, - ops::MultiplexGradGPUKernel); + ops::MultiplexGradGPUKernel); diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h index ab3cafaa32..3443151161 100644 --- a/paddle/operators/multiplex_op.h +++ b/paddle/operators/multiplex_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -template +template class MultiplexCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,7 +35,7 @@ class MultiplexCPUKernel : public framework::OpKernel { auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / 
rows; auto index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -47,7 +47,7 @@ class MultiplexCPUKernel : public framework::OpKernel { } }; -template +template class MultiplexGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -60,14 +60,15 @@ class MultiplexGradCPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto* index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 4f0a2a79ed..6ca6db7253 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -204,6 +204,6 @@ class NCCLBcastKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel); -REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index bb7ae20286..d747cc0cf5 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -33,9 +33,9 @@ #include "paddle/platform/place.h" USE_NO_KERNEL_OP(ncclInit); -USE_GPU_ONLY_OP(ncclAllReduce); -USE_GPU_ONLY_OP(ncclReduce); -USE_GPU_ONLY_OP(ncclBcast); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index 952da10434..5ad1610fde 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -67,7 +67,7 @@ class NCEOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; @@ -170,7 +170,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index ea92a797fe..6636dad060 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -28,7 +28,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template void PrepareSamples(const framework::ExecutionContext& context) { auto label = context.Input("Label"); const int64_t* label_data = label->data(); @@ -49,7 +49,7 @@ void PrepareSamples(const framework::ExecutionContext& context) { int num_label = label_dims.size() == 2 ? 
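Note: apart from spelling out `platform::CPUPlace` / `platform::GPUPlace`, the multiplex kernels above keep their row-gather semantics: row i of the output is copied from row i of the candidate input selected by `ids[i]`. A toy standalone illustration, with vectors in place of tensors and memory copies:

```cpp
// Row-gather semantics of the multiplex op kernels above: for each row i,
// copy row i of input number ids[i] into row i of the output.
#include <cassert>
#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

Matrix Multiplex(const std::vector<Matrix>& ins, const std::vector<int>& ids) {
  size_t rows = ins[0].size();
  Matrix out(rows);
  for (size_t i = 0; i < rows; ++i) {
    int k = ids[i];
    assert(k >= 0 && k < static_cast<int>(ins.size()) && "index out of range");
    out[i] = ins[k][i];  // row i comes from candidate input k
  }
  return out;
}

int main() {
  Matrix a = {{1, 1}, {2, 2}}, b = {{9, 9}, {8, 8}};
  Matrix out = Multiplex({a, b}, {1, 0});  // row 0 from b, row 1 from a
  std::printf("%g %g | %g %g\n", out[0][0], out[0][1], out[1][0], out[1][1]);  // 9 9 | 2 2
}
```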
label_dims[1] : 1; int index = 0; - for (size_t i = 0; i < label_dims[0]; ++i) { + for (int64_t i = 0; i < label_dims[0]; ++i) { int j = 0; for (; j < num_label; ++j) { sample_labels_data[index++] = label_data[i * num_label + j]; @@ -67,11 +67,11 @@ void PrepareSamples(const framework::ExecutionContext& context) { } } -template +template class NCEKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PrepareSamples(context); + PrepareSamples(context); auto sample_labels = context.Output("SampleLabels"); const int64_t* sample_labels_data = sample_labels->data(); auto sample_out = context.Output("SampleLogits"); @@ -86,7 +86,7 @@ class NCEKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); int num_neg_samples = context.Attr("num_neg_samples"); int num_total_classes = context.Attr("num_total_classes"); - int num_true_class = 1; + int64_t num_true_class = 1; if (label != nullptr) { num_true_class = label->dims()[1]; } @@ -95,18 +95,18 @@ class NCEKernel : public framework::OpKernel { auto bias = context.Input("Bias"); if (bias != nullptr) { const T* bias_data = bias->data(); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { sample_out_data[i] = bias_data[sample_labels_data[i]]; } } else { - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { sample_out_data[i] = 0; } } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { Eigen::Tensor result = (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * weight_mat.chip(sample_labels_data[i], 0)) @@ -115,8 +115,8 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } // forward cost - for (size_t i = 0; i < sample_labels->dims()[0]; ++i) { - size_t j = 0; + for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { + int64_t j = 0; out_data[i] = 0; T w = sample_weight == nullptr ? 1. : sample_weight_data[i]; // for true classes @@ -135,7 +135,7 @@ class NCEKernel : public framework::OpKernel { } }; -template +template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -162,7 +162,7 @@ class NCEGradKernel : public framework::OpKernel { T* sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); // backward cost - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { T o = sample_out_data[i]; T w = sample_weight == nullptr ? 
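Note: the nce_op.h changes above are purely about loop-index types (`size_t` to `int64_t`); the forward pass they iterate over is unchanged. For readers unfamiliar with the kernel, the visible part computes a logistic score per (example, sampled-label) pair:

```latex
% Per-sample score in NCEKernel::Compute above: i indexes (example, label) pairs,
% s = sample_labels->dims()[1] is the number of labels per example, l_i the sampled class id.
o_i = b_{l_i} + x_{\lfloor i/s \rfloor} \cdot w_{l_i},
\qquad
\mathrm{sample\_out}_i = \sigma(o_i) = \frac{1}{1 + e^{-o_i}}
```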
1 @@ -177,7 +177,7 @@ class NCEGradKernel : public framework::OpKernel { if (d_bias != nullptr) { T* d_bias_data = d_bias->mutable_data(context.GetPlace()); std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; } } @@ -188,7 +188,7 @@ class NCEGradKernel : public framework::OpKernel { std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); auto d_w_matrix = EigenMatrix::From(*d_w); auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_w_matrix.chip(sample_labels_data[i], 0) += x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * sample_grad_data[i]; @@ -200,7 +200,7 @@ class NCEGradKernel : public framework::OpKernel { d_x->mutable_data(context.GetPlace()); auto d_x_matrix = EigenMatrix::From(*d_x); auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; } diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index ebeb262d96..8935751f15 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -38,7 +38,10 @@ namespace operators { class NetOp : public framework::OperatorBase { public: static const char kAll[]; - NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {} + NetOp() + : framework::OperatorBase("plain_net", framework::VariableNameMap{}, + framework::VariableNameMap{}, + framework::AttributeMap{}) {} NetOp(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc index 63bebd5b44..22fba9568d 100644 --- a/paddle/operators/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -38,10 +38,10 @@ TEST(OpKernel, all) { net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {}))); + {{"Out", {"y"}}}, framework::AttributeMap{}))); net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, - {{"Out", {"z"}}}, {}))); + {{"Out", {"z"}}}, framework::AttributeMap{}))); net->CompleteAddOp(); AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, @@ -58,7 +58,7 @@ TEST(NetOp, insert_op) { NetOp net; auto op1 = std::unique_ptr( new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {})); + {{"Out", {"y"}}}, framework::AttributeMap{})); net.AppendOp(*op1); net.InsertOp(0, *op1); ASSERT_EQ(2UL, net.ops_.size()); @@ -68,10 +68,12 @@ TEST(NetOp, insert_op) { TEST(NetOp, Clone) { NetOp net; - net.AppendOp( - std::unique_ptr(new framework::NOP{"empty", {}, {}, {}})); - net.AppendOp(std::unique_ptr( - new framework::NOP{"empty2", {}, {}, {}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty2", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); net.CompleteAddOp(true); auto new_net_op = net.Clone(); ASSERT_NE(new_net_op, nullptr); diff --git 
a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index adb75df6ef..936dde22c3 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -134,6 +134,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad); -REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_CPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CPU_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu index 555a7dba23..c309fb625c 100644 --- a/paddle/operators/pad_op.cu +++ b/paddle/operators/pad_op.cu @@ -16,6 +16,7 @@ #include "paddle/operators/pad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_GPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h index 9534dbf545..1b95942af3 100644 --- a/paddle/operators/pad_op.h +++ b/paddle/operators/pad_op.h @@ -26,7 +26,7 @@ template using EigenTensor = framework::EigenTensor; -template +template void PadFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -42,33 +42,34 @@ void PadFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); out_tensor.device(place) = x_tensor.pad(paddings, pad_value); } -template +template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { int rank = context.Input("X")->dims().size(); switch (rank) { case 1: - PadFunction(context); + PadFunction(context); break; case 2: - PadFunction(context); + PadFunction(context); break; case 3: - PadFunction(context); + PadFunction(context); break; case 4: - PadFunction(context); + PadFunction(context); break; case 5: - PadFunction(context); + PadFunction(context); break; case 6: - PadFunction(context); + PadFunction(context); break; default: PADDLE_THROW( @@ -77,7 +78,7 @@ class PadKernel : public framework::OpKernel { } }; -template +template void PadGradFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -91,12 +92,13 @@ void PadGradFunction(const framework::ExecutionContext& context) { d_x->mutable_data(context.GetPlace()); auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); } } -template +template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -104,22 +106,22 @@ class PadGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - PadGradFunction(context); + PadGradFunction(context); break; case 2: - PadGradFunction(context); + PadGradFunction(context); break; case 3: - PadGradFunction(context); + PadGradFunction(context); break; case 4: - PadGradFunction(context); + PadGradFunction(context); break; 
case 5: - PadGradFunction(context); + PadGradFunction(context); break; case 6: - PadGradFunction(context); + PadGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc index be9fcc5661..77407f5cdf 100644 --- a/paddle/operators/pool_cudnn_op.cc +++ b/paddle/operators/pool_cudnn_op.cc @@ -19,19 +19,21 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn_grad, + ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn_grad, + ops::PoolGradKernel, + ops::PoolGradKernel) diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index 66dd194ccd..fc2b37bd0f 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -162,12 +162,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); - -REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index e26ffd86e5..45fa20280c 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -216,19 +216,19 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc index 1010cb7622..39a9dfbf79 100644 --- a/paddle/operators/pool_op.cu.cc 
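Note: PadKernel and PadGradKernel above keep their existing dispatch from a runtime tensor rank to a compile-time template parameter; only the device template argument changes in this patch. Since Eigen's fixed-rank tensor types need the rank at compile time, the kernel switches over the supported ranks. A minimal standalone illustration of that dispatch (toy names, not Paddle code):

```cpp
// Runtime-rank to compile-time-rank dispatch, as used by PadKernel above.
#include <cstdio>
#include <stdexcept>

template <int D>
void PadFunction() {
  // In the real kernel this builds the D pairs of paddings and evaluates
  // the Eigen pad expression for a rank-D tensor.
  std::printf("padding a rank-%d tensor\n", D);
}

void Pad(int rank) {
  switch (rank) {
    case 1: PadFunction<1>(); break;
    case 2: PadFunction<2>(); break;
    case 3: PadFunction<3>(); break;
    case 4: PadFunction<4>(); break;
    case 5: PadFunction<5>(); break;
    case 6: PadFunction<6>(); break;
    default: throw std::runtime_error("PadOp only supports tensors of rank 1 to 6.");
  }
}

int main() { Pad(4); }
```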
+++ b/paddle/operators/pool_op.cu.cc @@ -16,16 +16,18 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); -REGISTER_OP_GPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index 63492a89e8..ab85d587a3 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -50,7 +50,7 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker); }; -template +template class PoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -67,41 +67,41 @@ class PoolKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); switch (ksize.size()) { case 2: { if (pooling_type == "max") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool2d_forward; paddle::operators::math::MaxPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool2d_forward; paddle::operators::math::AvgPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; case 3: { if (pooling_type == "max") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool3d_forward; paddle::operators::math::MaxPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -109,7 +109,7 @@ class PoolKernel : public framework::OpKernel { } }; -template +template class PoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -130,42 +130,43 @@ class PoolGradKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = 
context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); auto temp = framework::EigenVector::Flatten(*in_x_grad); - temp.device(context.GetEigenDevice()) = + temp.device( + *context.template device_context().eigen_device()) = temp.constant(static_cast(0)); switch (ksize.size()) { case 2: { if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor + paddle::operators::math::MaxPool2dGradFunctor pool2d_backward; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; case 3: { if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor + paddle::operators::math::MaxPool3dGradFunctor pool3d_backward; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index b9c42a6912..1a2383f8b8 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -266,12 +266,15 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, @@ -279,9 +282,12 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc index 335064a7ee..4c9804da63 100644 --- a/paddle/operators/pool_with_index_op.cu.cc +++ b/paddle/operators/pool_with_index_op.cu.cc @@ -16,20 
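Note: the pooling kernels above plug a small "pool process" policy (math::MaxPool / math::AvgPool and their gradients) into shared Pool2dFunctor/Pool3dFunctor implementations; this patch only changes which device-context type those functors are instantiated with. A self-contained toy version of that policy pattern, reduced to one dimension:

```cpp
// Policy-based pooling: the window traversal is shared, the per-element combine
// step is supplied by a small "pool process" object, mirroring the functors
// used by PoolKernel above. Toy 1-D, non-overlapping-window version.
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

template <typename T>
struct MaxPool {
  T initial() const { return std::numeric_limits<T>::lowest(); }
  void compute(T& y, T x) const { y = std::max(y, x); }
  void finalize(T&, int) const {}
};

template <typename T>
struct AvgPool {
  T initial() const { return T(0); }
  void compute(T& y, T x) const { y += x; }
  void finalize(T& y, int n) const { y /= static_cast<T>(n); }
};

template <typename PoolProcess, typename T>
std::vector<T> Pool1d(const std::vector<T>& in, int ksize, PoolProcess process) {
  std::vector<T> out;
  for (size_t start = 0; start + ksize <= in.size(); start += ksize) {
    T y = process.initial();
    for (int k = 0; k < ksize; ++k) process.compute(y, in[start + k]);
    process.finalize(y, ksize);
    out.push_back(y);
  }
  return out;
}

int main() {
  std::vector<float> x = {1, 3, 2, 4, 6, 5};
  auto mx = Pool1d(x, 2, MaxPool<float>{});  // {3, 4, 6}
  auto av = Pool1d(x, 2, AvgPool<float>{});  // {2, 3, 5.5}
  std::printf("%g %g %g | %g %g %g\n", mx[0], mx[1], mx[2], av[0], av[1], av[2]);
}
```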
+16,28 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h index 40766c7e82..4f4087d1dd 100644 --- a/paddle/operators/pool_with_index_op.h +++ b/paddle/operators/pool_with_index_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxPoolWithIndexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -35,6 +35,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -44,23 +46,23 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor + paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor + paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } }; -template +template class MaxPoolWithIndexGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -81,18 +83,20 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.device_context(); + auto& device_ctx = context.template device_context(); math::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor + paddle::operators::math::MaxPool2dWithIndexGradFunctor pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor + paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); diff --git a/paddle/operators/positive_negative_pair_op.h 
b/paddle/operators/positive_negative_pair_op.h index 2efd3777e0..977e59b7d2 100644 --- a/paddle/operators/positive_negative_pair_op.h +++ b/paddle/operators/positive_negative_pair_op.h @@ -22,7 +22,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class PositiveNegativePairKernel : public framework::OpKernel { public: struct PredictionResult { diff --git a/paddle/operators/precision_recall_op.h b/paddle/operators/precision_recall_op.h index 4a871ce674..c0d55405a3 100644 --- a/paddle/operators/precision_recall_op.h +++ b/paddle/operators/precision_recall_op.h @@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix; enum StateVariable { TP = 0, FP, TN, FN }; -template +template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 055c471b45..317a2a4015 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -85,7 +85,8 @@ namespace ops = paddle::operators; REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL(prelu, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL(prelu_grad, - ops::PReluGradKernel); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu index 9e391dabae..12033dee0e 100644 --- a/paddle/operators/prelu_op.cu +++ b/paddle/operators/prelu_op.cu @@ -14,8 +14,9 @@ #include "paddle/operators/prelu_op.h" -REGISTER_OP_GPU_KERNEL( - prelu, paddle::operators::PReluKernel); -REGISTER_OP_GPU_KERNEL( - prelu_grad, - paddle::operators::PReluGradKernel); +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h index 5ad31c2203..56f9a553ec 100644 --- a/paddle/operators/prelu_op.h +++ b/paddle/operators/prelu_op.h @@ -39,7 +39,7 @@ class PReluFunctor { const T* alpha_; }; -template +template class PReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,9 +54,9 @@ class PReluKernel : public framework::OpKernel { int numel = x->numel(); - Transform trans; - trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr, - PReluFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); } }; @@ -76,7 +76,7 @@ class PReluGradFunctor { const T* alpha_; }; -template +template class PReluGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,9 +92,9 @@ class PReluGradKernel : public framework::OpKernel { const T* out_ptr = out->data(); int numel = dx->numel(); - Transform trans; - trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr, - PReluGradFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready } diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc index 36e460103a..cc350f6d26 100644 --- 
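Note: the prelu_op.h hunks above switch `platform::Transform` from a place to a device-context template argument; the element-wise functor itself is untouched. As a hedged, CPU-only stand-in (the functor body is not visible in this hunk; the usual PReLU definition f(x) = x for x > 0, alpha * x otherwise is assumed):

```cpp
// CPU-only sketch of what platform::Transform + PReluFunctor express above:
// apply an element-wise functor over a buffer. std::transform plays the role
// of the device-dispatched Transform here.
#include <algorithm>
#include <cstdio>
#include <vector>

template <typename T>
struct PReluFunctor {
  explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
  // Assumed standard PReLU: identity for positive inputs, scaled otherwise.
  T operator()(T x) const { return x > 0 ? x : (*alpha_) * x; }
  const T* alpha_;
};

int main() {
  float alpha = 0.25f;
  std::vector<float> x = {-2.0f, -0.5f, 0.0f, 1.5f}, out(x.size());
  std::transform(x.begin(), x.end(), out.begin(), PReluFunctor<float>(&alpha));
  for (float v : out) std::printf("%g ", v);  // -0.5 -0.125 0 1.5
  std::printf("\n");
}
```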
a/paddle/operators/proximal_adagrad_op.cc +++ b/paddle/operators/proximal_adagrad_op.cc @@ -114,4 +114,4 @@ REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); REGISTER_OP_CPU_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/operators/proximal_adagrad_op.cu index d0ae039518..42a178f94b 100644 --- a/paddle/operators/proximal_adagrad_op.cu +++ b/paddle/operators/proximal_adagrad_op.cu @@ -15,6 +15,6 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/operators/proximal_adagrad_op.h index 7a1560e8cb..523924d80e 100644 --- a/paddle/operators/proximal_adagrad_op.h +++ b/paddle/operators/proximal_adagrad_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,20 +45,20 @@ class ProximalAdagradOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto m_out = EigenVector::Flatten(*moment_out); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); - m_out.device(place) = m + g * g; + m_out.device(*place) = m + g * g; auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); if (l1 > static_cast(0)) { - p_out.device(place) = + p_out.device(*place) = prox_param.sign() * (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) .cwiseMax(static_cast(0.0))) / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); } else { - p_out.device(place) = + p_out.device(*place) = prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); } } diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc index 5693d0ec9e..0b26beb3ac 100644 --- a/paddle/operators/proximal_gd_op.cc +++ b/paddle/operators/proximal_gd_op.cc @@ -94,4 +94,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); REGISTER_OP_CPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/operators/proximal_gd_op.cu index 26f4ebaa0f..b7dd840d19 100644 --- a/paddle/operators/proximal_gd_op.cu +++ b/paddle/operators/proximal_gd_op.cu @@ -15,5 +15,6 @@ specific language governing permissions and limitations under the License. 
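Note: the ProximalAdagradOpKernel change above only swaps `GetEigenDevice` for an explicit `eigen_device()`; the update is unchanged. Written out, with all operations element-wise, g the gradient, m the accumulated squared gradient and lr, l1, l2 the attributes:

```latex
% Proximal Adagrad update as implemented in proximal_adagrad_op.h above (element-wise).
\begin{aligned}
m' &= m + g \odot g \\
\hat{p} &= p - \frac{\mathrm{lr}\, g}{\sqrt{m'}} \\
p' &=
\begin{cases}
  \dfrac{\operatorname{sign}(\hat{p}) \cdot \max\!\big(|\hat{p}| - \mathrm{lr}\, l_1,\ 0\big)}{1 + \mathrm{lr}\, l_2} & l_1 > 0 \\[1.2ex]
  \dfrac{\hat{p}}{1 + \mathrm{lr}\, l_2} & \text{otherwise}
\end{cases}
\end{aligned}
```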
*/ #include "paddle/operators/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h index bebda02041..64648b3cca 100644 --- a/paddle/operators/proximal_gd_op.h +++ b/paddle/operators/proximal_gd_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -42,7 +42,7 @@ class ProximalGDOpKernel : public framework::OpKernel { auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); auto p_out = EigenVector::Flatten(*param_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 912f88f455..b80b175792 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, ops::RankLossGradOp); -REGISTER_OP_CPU_KERNEL(rank_loss, - ops::RankLossKernel); REGISTER_OP_CPU_KERNEL( - rank_loss_grad, ops::RankLossGradKernel); + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 5382e3a629..5aee66443d 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -14,9 +14,9 @@ #include "paddle/operators/rank_loss_op.h" -REGISTER_OP_GPU_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); -REGISTER_OP_GPU_KERNEL( - rank_loss_grad, - paddle::operators::RankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index 703c77a0b2..ea24b61fd9 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class RankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,13 +35,13 @@ class RankLossKernel : public framework::OpKernel { auto left = framework::EigenVector::Flatten(*left_t); auto right = framework::EigenVector::Flatten(*right_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (1. 
+ (left - right).exp()).log() - label * (left - right); } }; -template +template class RankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -55,7 +55,7 @@ class RankLossGradKernel : public framework::OpKernel { auto* left_t = ctx.Input("Left"); auto* right_t = ctx.Input("Right"); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); auto d_out = framework::EigenVector::Flatten(*d_out_t); auto label = framework::EigenVector::Flatten(*label_t); auto left = framework::EigenVector::Flatten(*left_t); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 8b60b9c912..29f9163643 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -408,7 +408,8 @@ class RecurrentGradOp : public RecurrentBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } @@ -417,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_ctx); cur_scope.Rename(new_inside_name, inside_grad_name); diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index c69e416e10..eed482c1b4 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -72,11 +72,13 @@ class RecvOp : public framework::OperatorBase { // FIXME(typhoonzero): do not copy framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); - auto *block = Attr("OptimizeBlock"); - auto *program = block->Program(); + std::string program_str = Attr("OptimizeProgram"); + framework::ProgramDesc program_desc; + program_desc.ParseFromString(program_str); + framework::ProgramDescBind program(program_desc); framework::Executor executor(dev_ctx); // Run sub graph to get optimized tensor - executor.Run(*program, &recv_scope, block->ID(), + executor.Run(program, &recv_scope, 0, /*global_block*/ false /*create_local_scope*/); auto *out_var = recv_scope.FindVar("Out"); @@ -108,8 +110,8 @@ This operator will recv tensor from send_op "IP address to listen on.") .SetDefault("127.0.0.1:6164") .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); - AddAttr("OptimizeBlock", "type BlockDescBind*", - "optimize network run in server"); + AddAttr("OptimizeProgram", "type string", + "Serialized ProgramDesc string for recv to run."); } }; diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 2589a54cfc..b754637bf2 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -180,12 +180,13 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL(reduce_type, \ + ops::ReduceKernel); \ + REGISTER_OP_CPU_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); 
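Note: for context on the RankLossKernel hunks in rank_loss_op.h above (left and right are the scores of the two items in a pair, label the ground-truth preference), the forward expression shown is the usual RankNet-style pairwise loss:

```latex
% Forward pass of RankLossKernel above, per pair of scores (left, right).
\mathrm{out} = \log\!\big(1 + e^{\,\mathrm{left} - \mathrm{right}}\big) - \mathrm{label}\cdot(\mathrm{left} - \mathrm{right})
```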
FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu index d306e1a240..a10ace5253 100644 --- a/paddle/operators/reduce_op.cu +++ b/paddle/operators/reduce_op.cu @@ -17,12 +17,13 @@ namespace ops = paddle::operators; -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_GPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h index dd6547542d..47ce910f28 100644 --- a/paddle/operators/reduce_op.h +++ b/paddle/operators/reduce_op.h @@ -32,55 +32,55 @@ template ; struct SumFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.sum(dim); } }; struct SumGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim); } }; struct MeanFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.mean(dim); } }; struct MeanGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim) / dx.constant(size); } }; struct MaxFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.maximum(dim); } }; struct MinFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.minimum(dim); } }; struct MaxOrMinGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { auto equals = x == y.broadcast(dim); auto ones = dx.constant(1); @@ -91,7 +91,7 @@ struct MaxOrMinGradFunctor { } }; -template +template class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -139,7 +139,8 @@ class ReduceKernel : public framework::OpKernel { dims = framework::make_ddim(dims_vector); } - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; if (D == 1) { @@ -152,7 +153,7 @@ class ReduceKernel : public framework::OpKernel { } }; -template +template class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -201,7 +202,8 @@ class ReduceGradKernel : public framework::OpKernel { Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; 
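Note: the reduce functors above (SumFunctor, MeanFunctor, MaxFunctor, ...) are small policies handed to a shared ReduceKernel; the patch only widens their first parameter from an Eigen place to the device passed in by the kernel. A toy standalone version of that plug-in point (the real functors evaluate Eigen expressions on a device):

```cpp
// Toy version of the reduce-functor pattern in reduce_op.h above: the kernel
// owns the traversal and output, the functor decides what "reduce" means.
#include <cstdio>
#include <vector>

struct SumFunctor {
  template <typename X, typename Y>
  void operator()(const X& x, Y& y) const {
    y = 0;
    for (auto v : x) y += v;
  }
};

struct MaxFunctor {
  template <typename X, typename Y>
  void operator()(const X& x, Y& y) const {
    y = x.front();
    for (auto v : x) y = v > y ? v : y;
  }
};

template <typename Functor>
double Reduce(const std::vector<double>& x, Functor functor) {
  double y = 0;
  functor(x, y);  // the real ReduceKernel calls functor(place, x, y, dim)
  return y;
}

int main() {
  std::vector<double> x = {1.0, 4.0, 2.5};
  std::printf("sum=%g max=%g\n", Reduce(x, SumFunctor{}), Reduce(x, MaxFunctor{}));
}
```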
broadcast_dim[dim] = input0->dims()[dim]; - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, broadcast_dim[dim]); diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu index dca6c15007..b7329238c0 100644 --- a/paddle/operators/reshape_op.cu +++ b/paddle/operators/reshape_op.cu @@ -14,9 +14,9 @@ #include "paddle/operators/reshape_op.h" -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape, paddle::operators::ReshapeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape_grad, paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 73fd1da642..92d8cbbb56 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class ReshapeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel { } }; -template +template class ReshapeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc index a9c45f639c..fc3f9b8988 100644 --- a/paddle/operators/rmsprop_op.cc +++ b/paddle/operators/rmsprop_op.cc @@ -116,5 +116,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); -REGISTER_OP_CPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu index 52634a5481..2a9fd6e104 100644 --- a/paddle/operators/rmsprop_op.cu +++ b/paddle/operators/rmsprop_op.cu @@ -16,5 +16,5 @@ #include "paddle/operators/rmsprop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h index 7bf2129010..16a561835d 100644 --- a/paddle/operators/rmsprop_op.h +++ b/paddle/operators/rmsprop_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class RmspropOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class RmspropOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto mom_out = EigenVector::Flatten(*moment_out); auto ms_out = EigenVector::Flatten(*mean_square_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index 2b5e66c96b..75fcea8401 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -157,9 +157,10 @@ namespace ops = paddle::operators; REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( - roi_pool, ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); REGISTER_OP_CPU_KERNEL( roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - 
ops::CPUROIPoolOpKernel); + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu index 9a4c8ca752..a874befe4d 100644 --- a/paddle/operators/roi_pool_op.cu +++ b/paddle/operators/roi_pool_op.cu @@ -177,7 +177,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); @@ -199,10 +199,11 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - roi_pool, ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolOpKernel); + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h index 3812c66c65..09a9d3d870 100644 --- a/paddle/operators/roi_pool_op.h +++ b/paddle/operators/roi_pool_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class CPUROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -126,7 +126,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel { } }; -template +template class CPUROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -145,8 +145,9 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), in_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); auto in_stride = framework::stride(in->dims()); auto argmax_stride = framework::stride(argmax->dims()); diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc new file mode 100644 index 0000000000..5203a5079c --- /dev/null +++ b/paddle/operators/row_conv_op.cc @@ -0,0 +1,260 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/row_conv_op.h" +#include "paddle/framework/eigen.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +class RowConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RowConvOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be same."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class RowConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto filter_grad_name = framework::GradVarName("Filter"); + if (ctx->HasOutput(filter_grad_name)) { + auto filter_dims = ctx->GetInputDim("Filter"); + ctx->SetOutputDim(filter_grad_name, filter_dims); + } + } +}; + +class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowConvOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input(X) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape (T x N), where T " + "is the total time steps in this mini-batch and N is the input " + "data dimension."); + AddInput("Filter", + "(Tensor), the input(Filter) is a learnable parameter. It " + "is a 2-D tensor with shape (future_context x N), where, " + "future_context is the future context length and N is the data " + "dimension."); + AddOutput("Out", + "(LoDTensor), the output(Out) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LodTensor is a matrix with shape T x N, i.e., the " + "same shape as X."); + AddComment(R"DOC( +Row-convolution Operator. + +The row convolution is called lookahead convolution. This operator was +introduced in the following paper for DeepSpeech2: +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf + +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. 
However, unlike +unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is +different from the 1D sequence convolution, and is computed as follows: + +Given an input sequence $in$ of length $t$ and input dimension $d$, +and a filter ($W$) of size $context \times d$, +the output sequence is convolved as: + +$$ +out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} +$$ + +)DOC"); + } +}; + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + + auto batch_indices = x->lod()[0]; + auto input_dim = x->dims()[1]; // 'in' is of size T x N + size_t num_sequence = batch_indices.size() - 1; + + auto future_context = filter->dims()[0]; + auto weights = EigenMatrix::From(*filter); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + Tensor cur_input_sequence = + x->Slice(start, end); // Current input sequence + Tensor cur_output_sequence = + out->Slice(start, end); // Current output sequence + auto cip_seq = EigenMatrix::From(cur_input_sequence); + auto cot_seq = EigenMatrix::From(cur_output_sequence); + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + for (int d = 0; d < input_dim; d++) { + if (w == 0) { + cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d); + } else { + cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d); + } + } + } + } + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *d_out = context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *d_filter = context.Output(framework::GradVarName("Filter")); + + auto input_dim = x->dims()[1]; // 'x' is of size T x N + auto batch_indices = x->lod()[0]; + size_t num_sequence = batch_indices.size() - 1; + auto future_context = filter->dims()[0]; + + if (d_filter) { + d_filter->mutable_data(context.GetPlace()); + auto dweights = + EigenMatrix::From(*d_filter); // Gradient of weight matrix + dweights.setZero(); + + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_input = x->Slice(start, end); // Current input sequence + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + + auto cur_ip = EigenMatrix::From(cur_input); + auto cur_dout = EigenMatrix::From(cur_doutput); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dweights (Updating the gradient of weight matrix) + for (int d = 0; d < input_dim; d++) { + 
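Note: the CPU RowConvKernel above is a direct loop over the formula in the operator comment; each output row is a weighted sum of the current row and up to future_context - 1 following rows of the same sequence. A standalone reference of the forward pass for a single sequence (plain vectors instead of LoDTensor slices):

```cpp
// Reference forward pass for row (lookahead) convolution on one sequence,
// matching the loops of RowConvKernel::Compute above:
//   out[k][d] = sum over w in [0, min(future_context, T-k)) of filter[w][d] * in[k+w][d]
#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>;  // T x N (time steps x data dim)

Matrix RowConv(const Matrix& in, const Matrix& filter) {
  int timesteps = static_cast<int>(in.size());
  int input_dim = static_cast<int>(in[0].size());
  int future_context = static_cast<int>(filter.size());
  Matrix out(timesteps, std::vector<float>(input_dim, 0.0f));
  for (int k = 0; k < timesteps; ++k) {
    for (int w = 0; w < future_context && k + w < timesteps; ++w) {
      for (int d = 0; d < input_dim; ++d) {
        out[k][d] += filter[w][d] * in[k + w][d];
      }
    }
  }
  return out;
}

int main() {
  Matrix in = {{1, 2}, {3, 4}, {5, 6}};          // T = 3, N = 2
  Matrix filter = {{1.0f, 1.0f}, {0.5f, 0.5f}};  // future_context = 2
  Matrix out = RowConv(in, filter);
  for (auto& row : out) std::printf("%g %g\n", row[0], row[1]);
  // Expected: 2.5 4 / 5.5 7 / 5 6
}
```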
dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d); + } + } + } + } + } + + if (dx) { + dx->mutable_data(context.GetPlace()); + auto weights = EigenMatrix::From(*filter); + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + Tensor cur_dinput = + dx->Slice(start, end); // Current input grad sequence + + auto cur_dout = EigenMatrix::From(cur_doutput); + auto cur_dip = EigenMatrix::From(cur_dinput); + cur_dip.setZero(); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dinput (Updating the gradient wrt input) + for (int d = 0; d < input_dim; d++) { + cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d); + } + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, + ops::RowConvGradOp); +REGISTER_OP_CPU_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu new file mode 100644 index 0000000000..56a98ff299 --- /dev/null +++ b/paddle/operators/row_conv_op.cu @@ -0,0 +1,410 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/row_conv_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Forward prop (shared memory version, for small future_context) +template +__global__ void RowConvForwardSharedMemory(const T *in, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *out) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (d < input_dim) + ? 
sw[w * blx + thx] * in[(start + k + w) * input_dim + d] + : static_cast(0); + } + if (d < input_dim) { + out[(start + k) * input_dim + d] = sum; + } + } + } +} + +// Forward prop (naive version) +template +__global__ void RowConvForward(const T *in, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *out) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]); + } + out[(start + k) * input_dim + d] = sum; + } + } +} + +// Compute input gradient (shared memory version, for small future_context) +template +__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *din) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (d < input_dim) + ? 
(sw[w * blx + thx] * dout[(k + start - w) * input_dim + d]) + : static_cast(0); + } + if (d < input_dim) { + din[(k + start) * input_dim + d] = sum; + } + } + } +} + +// Compute input gradient (Naive version) +template +__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *din) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]); + } + din[(k + start) * input_dim + d] = sum; + } + } +} + +// Compute W gradient (small future_context version) +template +__global__ void RowConvGradFilterImproved(const T *in, const T *dout, + int num_sequence, int input_dim, + int future_context, int block_x, + int block_y, + const size_t *batch_indices, + T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + + extern __shared__ T mem[]; + + int xdim_sh_in = block_y; + int xdim_sh_dout = block_y; + // int xdim_sh_dfilter = future_context; + int ydim_sh_in = block_x; + int ydim_sh_dout = block_x + future_context - 1; + int ydim_sh_dfilter = block_y; + + T *sh_in = mem; + T *sh_dout = &mem[xdim_sh_in * ydim_sh_in]; + T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout]; + + if (thy < future_context) { + sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * ydim_sh_in + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0); + sh_dout[thx * ydim_sh_dout + thy + future_context - 1] = + (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0); + __syncthreads(); + + if (thy < future_context - 1) { + int pos_offset = pos - future_context + 1; + sh_dout[thx * ydim_sh_dout + thy] = + (d < input_dim && pos_offset >= start) + ? dout[pos_offset * input_dim + d] + : T(0); + } + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + T val = sh_in[thy * ydim_sh_in + thx] * + sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
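+          // Warp-level tree reduction: each __shfl_down(val, offset) reads val
+          // from the lane `offset` positions above the current one, so after
+          // the loop lane 0 of the warp holds the sum of val across all 32
+          // threads along x.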
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0) { + sh_dfilter[w * ydim_sh_dfilter + thy] += val; + } + __syncthreads(); + } + } + } + for (int w = thy; (w < future_context) && (d < input_dim); w += bly) { + dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx]; + } +} + +// Compute weight(filter) gradient +template +__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, + int input_dim, int future_context, + int block_x, int block_y, + const size_t *batch_indices, T *dfilter) { + int blx = blockDim.x; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + extern __shared__ T mem[]; + T *sh_in = mem; + T *sh_dout = &mem[block_x * block_y]; + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * block_y + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0; + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + sh_dout[thx * block_y + thy] = + (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps) + ? dout[(pos - w) * input_dim + d] + : 0.0; + __syncthreads(); + + T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. + val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0 && (gx + thy) < input_dim) { + dfilter[w * input_dim + gx + thy] += val; + } + } + } + } +} + +} // namespace + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *Out = context.Output("Out"); + + const T *in = X->data(); + const T *weight = Filter->data(); + T *out = Out->mutable_data(context.GetPlace()); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.data(); + auto stream = context.cuda_device_context().stream(); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvForwardSharedMemory< + T><<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvForward<<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *dOut = context.Input(framework::GradVarName("Out")); + const T *in = X->data(); + const T *weights = Filter->data(); + const T *dout = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = 
batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.data(); + + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; + + if (dFilter) { + T *dfilter = dFilter->mutable_data(context.GetPlace()); + zero(device_ctx, dFilter, static_cast(0.0)); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_y * block_x + block_y * (block_x + future_context - 1) + + future_context * block_y) * + sizeof(T); + RowConvGradFilterImproved< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 + RowConvGradFilter< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } + } + + if (dX) { + T *din = dX->mutable_data(context.GetPlace()); + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvGradInputSharedMemory< + T><<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvGradInput<<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h new file mode 100644 index 0000000000..80912ad8f7 --- /dev/null +++ b/paddle/operators/row_conv_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index e5c10fec4d..d848be823e 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -75,8 +75,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); -REGISTER_OP_CPU_KERNEL(scale, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu index 0d70775159..0c7980430f 100644 --- a/paddle/operators/scale_op.cu +++ b/paddle/operators/scale_op.cu @@ -14,8 +14,10 @@ #include "paddle/operators/scale_op.h" -REGISTER_OP_GPU_KERNEL( - scale, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h index 4931294c9d..02a8c97a83 100644 --- a/paddle/operators/scale_op.h +++ b/paddle/operators/scale_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ScaleKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -31,7 +31,8 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*tensor); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index ce4b794bc3..573bbcd187 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -87,10 +87,15 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Updates", "The updated value of updates op"); AddOutput("Out", "The output of add op"); AddComment(R"DOC( -Scatter Operator by selecting from the first axis, +Scatter Operator. 
-Out = Ref +This operator obtains output by updating the input on selected indices on the first axis: + +$$ +Out = Ref \\ Out[Index] = Ref[Index] + Updates +$$ + )DOC"); } }; diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu index 3b32ae2fb7..6b43a1389f 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/operators/scatter_op.cu @@ -59,5 +59,5 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index ac03eb3752..3e2e2051af 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -85,7 +85,10 @@ void StartServerNet() { paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); - attrs.insert({"OptimizeBlock", block}); + std::string program_proto; + PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); + + attrs.insert({"OptimizeProgram", program_proto}); recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index b862056ad4..ede9754697 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -148,8 +148,9 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker, seq_expand_grad, ops::SeqExpandOpGrad); -REGISTER_OP_CPU_KERNEL(seq_expand, - ops::SeqExpandKernel); +REGISTER_OP_CPU_KERNEL( + seq_expand, + ops::SeqExpandKernel); REGISTER_OP_CPU_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.cu b/paddle/operators/seq_expand_op.cu index f1e4b82a76..8e67ce9ccb 100644 --- a/paddle/operators/seq_expand_op.cu +++ b/paddle/operators/seq_expand_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/seq_expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(seq_expand, - ops::SeqExpandKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + seq_expand, + ops::SeqExpandKernel); +REGISTER_OP_CUDA_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index 4ef0d02cf8..fbee0db454 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -23,7 +23,7 @@ namespace operators { using LoDTensor = framework::LoDTensor; -template +template class SeqExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,7 +37,8 @@ class SeqExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); auto out_starts = out->lod().back(); @@ -50,7 +51,7 @@ class 
SeqExpandKernel : public framework::OpKernel { Eigen::TensorMap> out_t(out_data, scale, element_len); Eigen::array cast({{scale, 1}}); - out_t.device(place) = x_t.broadcast(cast); + out_t.device(*place) = x_t.broadcast(cast); x_data += element_len; out_data += element_len * scale; } @@ -69,7 +70,7 @@ class SeqExpandKernel : public framework::OpKernel { * Grad(X).lod = Input(X).lod * * */ -template +template class SeqExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,8 +90,9 @@ class SeqExpandGradKernel : public framework::OpKernel { d_out_t(d_out_data, static_cast(repeat), element_len); Eigen::TensorMap> d_x_t(d_x_data, static_cast(element_len)); - auto place = context.GetEigenDevice(); - d_x_t.device(place) = d_out_t.sum(Eigen::array({{0}})); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); d_out_data += (repeat * element_len); d_x_data += element_len; } diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index d1de0b4447..9c7e5456e8 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -129,7 +129,7 @@ REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, sequence_concat_grad, ops::SequenceConcatGradOp); REGISTER_OP_CPU_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); + ops::SequenceConcatOpKernel); REGISTER_OP_CPU_KERNEL( sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatGradOpKernel); diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/operators/sequence_concat_op.cu.cc index 9ca99c2258..144bdb5af6 100644 --- a/paddle/operators/sequence_concat_op.cu.cc +++ b/paddle/operators/sequence_concat_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/operators/sequence_concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); -REGISTER_OP_GPU_KERNEL( - sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index 09212070aa..8445224f46 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -59,7 +59,7 @@ LoD ConcatLoD(const std::vector ins, const size_t level) { return out_lod; } -template +template class SequenceConcatOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -119,7 +119,7 @@ class SequenceConcatOpKernel : public framework::OpKernel { } }; -template +template class SequenceConcatGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index c5533732d4..f5c4f1c133 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -179,9 +179,10 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, sequence_conv_grad, ops::SequenceConvGradOp); REGISTER_OP_CPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); REGISTER_OP_CPU_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc index c8136dbcb3..eacba79ace 100644 --- a/paddle/operators/sequence_conv_op.cu.cc +++ b/paddle/operators/sequence_conv_op.cu.cc @@ -15,10 +15,11 @@ #include "paddle/operators/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index b8fbe2647c..bb584b7bfa 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,21 +56,23 @@ class SequenceConvKernel : public framework::OpKernel { Tensor col; col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. 
- math::SetConstant set_zero; - set_zero(context.device_context(), &col, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); - math::ContextProjectFunctor seq_project_functor; + math::ContextProjectFunctor seq_project_functor; - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, false, filter, false, - static_cast(1.0), out, static_cast(0.0)); + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); } }; -template +template class SequenceConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -95,7 +97,8 @@ class SequenceConvGradKernel : public framework::OpKernel { int down_pad = std::max(0, context_start + context_length - 1); int sequence_width = static_cast(in->dims()[1]); - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); // use col_shape in the im2col calculation framework::DDim col_shape = {in->dims()[0], sequence_width * context_length}; @@ -104,38 +107,36 @@ class SequenceConvGradKernel : public framework::OpKernel { if (in_g || filter_g || (padding_trainable && padding_data_g)) { col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - set_zero(context.device_context(), &col, static_cast(0)); - math::matmul(context.device_context(), *out_g, false, *filter, - true, T(1.0), &col, T(1.0)); + set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); } - math::ContextProjectFunctor seq_project_functor; - math::ContextProjectGradFunctor seq_project_grad_functor; + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; if (in_g) { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - set_zero(context.device_context(), in_g, static_cast(0)); + set_zero(dev_ctx, in_g, static_cast(0)); - seq_project_grad_functor(context.device_context(), *in_g, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, false, true, - padding_data_g, &col); + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); } if (padding_trainable && padding_data_g) { padding_data_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), padding_data_g, static_cast(0)); + set_zero(dev_ctx, padding_data_g, static_cast(0)); LoDTensor* input = const_cast(in); - seq_project_grad_functor(context.device_context(), *input, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, true, false, - padding_data_g, &col); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); } if (filter_g) { filter_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_g, static_cast(0)); + set_zero(dev_ctx, filter_g, static_cast(0)); Tensor filter_grad = *filter_g; 
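      // Note: filter_grad shares its buffer with filter_g (Tensor copies are
      // shallow), so the matmul at the end of this block accumulates directly
      // into the filter gradient output.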
LoDTensor out_grad = *out_g; @@ -145,12 +146,12 @@ class SequenceConvGradKernel : public framework::OpKernel { padding_data = context.Input("PaddingData"); } - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, true, out_grad, - false, T(1.0), &filter_grad, T(1.0)); + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); } } }; diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index a2f4257037..3526e45a1b 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -58,12 +58,12 @@ Sequence Pool Operator. The SequencePoolOp pools features of all time-steps of each instance. It supports six pooling types: -1. AVERAGE: Out[i] = $$avg(X_i)$$ -2. SUM: Out[i] = $$\sum_jX_{ij}$$ -3. SQRT: Out[i] = $$\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ +1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$ +2. SUM: $$Out[i] = \sum_jX_{ij}$$ +3. SQRT: $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ 4. LAST: Out[i] = last instance in i-th sequence X[i] 5. FIRST: Out[i] = first instance in i-th sequence X[i] -6. MAX: Out[i] = $$max(X_i)$$ +6. MAX: $$Out[i] = max(X_i)$$ The following example explains how this works: For a mini-batch of 3 variable-length sentences, @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, sequence_pool_grad, ops::SequencePoolGradOp); REGISTER_OP_CPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); + sequence_pool, + ops::SequencePoolKernel); REGISTER_OP_CPU_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu index 66850772d5..fcd6508435 100644 --- a/paddle/operators/sequence_pool_op.cu +++ b/paddle/operators/sequence_pool_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h index 7f136d8cf0..7519aa1d72 100644 --- a/paddle/operators/sequence_pool_op.h +++ b/paddle/operators/sequence_pool_op.h @@ -30,7 +30,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SequencePoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,17 +54,18 @@ class SequencePoolKernel : public framework::OpKernel { auto lod_level_0 = lod[0]; out->mutable_data(context.GetPlace()); - + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; + math::MaxSeqPoolFunctor max_pool; auto* index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); - max_pool(context.device_context(), *in, out, index); + max_pool(dev_ctx, *in, out, index); return; } - auto place = context.GetEigenDevice(); + auto& place = + *context.template 
device_context().eigen_device(); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { Tensor in_t = in->Slice(static_cast(lod_level_0[i]), static_cast(lod_level_0[i + 1])); @@ -91,7 +92,7 @@ class SequencePoolKernel : public framework::OpKernel { } }; -template +template class SequencePoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -105,20 +106,23 @@ class SequencePoolGradKernel : public framework::OpKernel { int64_t w = in->numel() / dims[0]; in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; + math::MaxSeqPoolGradFunctor max_pool_grad; auto* index = context.Input("MaxIndex"); - max_pool_grad(context.device_context(), *out_g, *index, in_g); + max_pool_grad(dev_ctx, *out_g, *index, in_g); return; } if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(context.device_context(), in_g, 0); + math::SetConstant functor; + functor(dev_ctx, in_g, 0); } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 255683a572..481db8f9e5 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -125,7 +125,7 @@ REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, sequence_slice_grad, ops::SequenceSliceGradOp); REGISTER_OP_CPU_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); + ops::SequenceSliceOpKernel); REGISTER_OP_CPU_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu index a9f59dadba..43a21d619f 100755 --- a/paddle/operators/sequence_slice_op.cu +++ b/paddle/operators/sequence_slice_op.cu @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/operators/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index 428ef556da..14bcaebbb4 100644 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -39,7 +39,7 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, return out_lod; } -template +template class SequenceSliceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -108,7 +108,7 @@ class SequenceSliceOpKernel : public framework::OpKernel { } }; -template +template class SequenceSliceGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -143,8 +143,9 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); x_grad->set_lod(in->lod()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); auto out_grad_stride = framework::stride(out_grad->dims()); diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc index 32c1502566..37d5452e6b 100644 --- a/paddle/operators/sequence_softmax_op.cc +++ b/paddle/operators/sequence_softmax_op.cc @@ -99,7 +99,7 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, ops::SequenceSoftmaxGradOp); REGISTER_OP_CPU_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel); + ops::SequenceSoftmaxKernel); REGISTER_OP_CPU_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/operators/sequence_softmax_op.cu.cc index 7023795a3b..5f65b4daf9 100644 --- a/paddle/operators/sequence_softmax_op.cu.cc +++ b/paddle/operators/sequence_softmax_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/operators/sequence_softmax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel) -REGISTER_OP_GPU_KERNEL( + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h index 1b68dd0662..e889e88cb3 100644 --- a/paddle/operators/sequence_softmax_op.h +++ b/paddle/operators/sequence_softmax_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceSoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,12 +52,13 @@ class SequenceSoftmaxKernel : public framework::OpKernel { framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxFunctor()(ctx.device_context(), &x_i, &out_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; -template +template class SequenceSoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -83,8 +84,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradFunctor()(ctx.device_context(), &out_i, - &out_grad_i, &x_grad_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + &x_grad_i); } } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 5576d7b8be..121bf60b27 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -62,8 +62,8 @@ $$param\_out = param - learning\_rate * grad$$ }; template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -90,13 +90,14 @@ struct SparseSGDFunctor { } }; -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); -REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CPU_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 7b6c5ec306..a3c0db7e50 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -41,8 +41,8 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, } // namespace template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -62,21 +62,19 @@ struct SparseSGDFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<(context) - .stream()>>>(in_data, in_rows.data(), - learning_rate.data(), out_data, - in_row_numel); + 
SparseSGDFunctorKernel<<>>( + in_data, in_rows.data(), learning_rate.data(), out_data, + in_row_numel); } }; -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 78b595fc6c..c920025a91 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -20,15 +20,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output); }; -template +template class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class SGDOpKernel : public framework::OpKernel { auto g = framework::EigenVector::Flatten(*grad); auto o = framework::EigenVector::Flatten(*param_out); auto lr = framework::EigenVector::Flatten(*learning_rate); - auto place = ctx.GetEigenDevice(); + auto& place = + *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); o.device(place) = p - lr.broadcast(grad_dsize) * g; @@ -56,8 +57,9 @@ class SGDOpKernel : public framework::OpKernel { // It's better to find a more elegant solution. PADDLE_ENFORCE_EQ(param, param_out); auto* grad = ctx.Input("Grad"); - SparseSGDFunctor functor; - functor(ctx.device_context(), *grad, *learning_rate, param_out); + SparseSGDFunctor functor; + functor(ctx.template device_context(), *grad, + *learning_rate, param_out); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index 782f4c7936..b8a1bf122a 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -142,7 +142,7 @@ REGISTER_OP(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu index 32a39956a1..1b569c93ed 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::GPUPlace, float>); -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::GPUPlace, float>); 
+REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h index 2a9d9bbc77..8fe7c5ba82 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template +template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -32,7 +32,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto labels = framework::EigenVector::Flatten(*Labels); auto out = framework::EigenVector::Flatten(*Out); - auto place = context.GetEigenDevice(); + auto &place = *context.device_context().eigen_device(); // term1 = max(x, 0) auto term1 = x.cwiseMax(static_cast(0)); @@ -46,7 +46,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { }; // dX = sigmoid(X) - labels -template +template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -62,7 +62,8 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { auto labels = framework::EigenVector::Flatten(*Labels); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); dx.device(place) = dout * (sigmoid_x - labels); diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc index 08bf2e4e7c..d5a7ccb77e 100644 --- a/paddle/operators/sign_op.cc +++ b/paddle/operators/sign_op.cc @@ -67,5 +67,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker); -REGISTER_OP_CPU_KERNEL(sign, - ops::SignKernel); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu index 4d0638cb97..9bc1c65d21 100644 --- a/paddle/operators/sign_op.cu +++ b/paddle/operators/sign_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/sign_op.h" -REGISTER_OP_GPU_KERNEL( - sign, paddle::operators::SignKernel); +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h index ab5cd4bac0..2e476ed665 100644 --- a/paddle/operators/sign_op.h +++ b/paddle/operators/sign_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class SignKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -29,7 +29,8 @@ class SignKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); eigen_out.device(place) = eigen_in.sign(); } }; 
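The sign_op, scale_op, and the other per-operator changes above all follow one refactor: kernels are templated on a DeviceContext type instead of a Place type, the Eigen device is obtained through the templated device_context() accessor, and GPU registration moves from REGISTER_OP_GPU_KERNEL to REGISTER_OP_CUDA_KERNEL. The standalone sketch below is not part of the diff; the operator name MyAbs and its inputs are hypothetical, and it only illustrates the new-style pattern under the framework headers used elsewhere in this PR.

/* Illustrative sketch only -- not part of this PR. */
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

// New-style kernel: templated on DeviceContext (CPUDeviceContext or
// CUDADeviceContext) rather than on a Place type.
template <typename DeviceContext, typename T>
class MyAbsKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*in);
    auto y = framework::EigenVector<T>::Flatten(*out);
    // The Eigen device now comes from the templated DeviceContext instead of
    // context.GetEigenDevice<Place>().
    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();
    y.device(place) = x.abs();
  }
};

}  // namespace operators
}  // namespace paddle

// Registration follows the same renaming seen above:
//   REGISTER_OP_CPU_KERNEL(my_abs,
//       ops::MyAbsKernel<paddle::platform::CPUDeviceContext, float>);
//   REGISTER_OP_CUDA_KERNEL(my_abs,
//       ops::MyAbsKernel<paddle::platform::CUDADeviceContext, float>);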
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index 50543fcc14..56e8d9058f 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -138,7 +138,8 @@ REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, ops::SmoothL1LossGradOp); REGISTER_OP_CPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); + smooth_l1_loss, + ops::SmoothL1LossKernel); REGISTER_OP_CPU_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu index 1c3172f438..8e94ebac64 100644 --- a/paddle/operators/smooth_l1_loss_op.cu +++ b/paddle/operators/smooth_l1_loss_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h index 39d0070b6c..1a70c9c63c 100644 --- a/paddle/operators/smooth_l1_loss_op.h +++ b/paddle/operators/smooth_l1_loss_op.h @@ -44,7 +44,7 @@ struct SmoothL1LossForward { T sigma2; }; -template +template class SmoothL1LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class SmoothL1LossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto sigma = static_cast(context.Attr("sigma")); T sigma2 = sigma * sigma; @@ -67,12 +68,12 @@ class SmoothL1LossKernel : public framework::OpKernel { auto y = EigenVector::Flatten(*in1); auto diff = EigenVector::Flatten(*out0); - diff.device(place) = x - y; + diff.device(*place) = x - y; // multiply inside weight if (has_weight) { auto inside_weight = EigenVector::Flatten(*in2); // cache diff, reused in bp - diff.device(place) = diff * inside_weight; + diff.device(*place) = diff * inside_weight; } auto in_counts = in0->numel(); @@ -81,12 +82,12 @@ class SmoothL1LossKernel : public framework::OpKernel { context.GetPlace()); auto errors = EigenVector::Flatten(ptensor_errors); // apply smooth l1 forward - errors.device(place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); // multiply outside weight if (has_weight) { auto outside_weight = EigenVector::Flatten(*in3); - errors.device(place) = errors * outside_weight; + errors.device(*place) = errors * outside_weight; } auto loss = EigenVector::Flatten(*out1); // first dimension of 'X' is the number of samples @@ -94,7 +95,7 @@ class SmoothL1LossKernel : public framework::OpKernel { framework::make_ddim({static_cast(in0->dims()[0]), static_cast(in_counts / in0->dims()[0])}); auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); - loss.device(place) = errors_mat_view.sum(Eigen::array({{1}})); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); } }; @@ -114,7 +115,7 @@ struct SmoothL1LossBackward { T sigma2; }; -template +template class SmoothL1LossGradKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& context) const override { @@ -126,7 +127,8 @@ class SmoothL1LossGradKernel : public framework::OpKernel { T sigma2 = sigma * sigma; bool has_weight = (in0 != nullptr) && (in1 != nullptr); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto in_dims = in2->dims(); auto counts = in2->numel(); @@ -139,7 +141,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { context.GetPlace()); auto diff = EigenVector::Flatten(ptensor_diff); // apply smooth l1 backwoard - diff.device(place) = EigenVector::Flatten(*in2).unaryExpr( + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( SmoothL1LossBackward(sigma2)); // compute weights @@ -147,11 +149,11 @@ class SmoothL1LossGradKernel : public framework::OpKernel { ptensor_weights.mutable_data(mat_dims, context.GetPlace()); auto weights = EigenMatrix::From(ptensor_weights); // initialize to 1.0 - weights.device(place) = weights.constant(static_cast(1.0)); + weights.device(*place) = weights.constant(static_cast(1.0)); if (has_weight) { auto inside_weight = EigenMatrix::From(*in0, mat_dims); auto outside_weight = EigenMatrix::From(*in1, mat_dims); - weights.device(place) = inside_weight * outside_weight; + weights.device(*place) = inside_weight * outside_weight; } // compute gradients @@ -167,13 +169,13 @@ class SmoothL1LossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenMatrix::From(*out0, mat_dims); - x_grad.device(place) = gradients; + x_grad.device(*place) = gradients; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenMatrix::From(*out1, mat_dims); - y_grad.device(place) = -1 * gradients; + y_grad.device(*place) = -1 * gradients; } } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 93e0525bad..0988c83d43 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -89,7 +89,8 @@ namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL(softmax, - ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc index 013ace19ae..7b9882cbcf 100644 --- a/paddle/operators/softmax_op.cu.cc +++ b/paddle/operators/softmax_op.cu.cc @@ -16,7 +16,8 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax, - ops::SoftmaxKernel); -REGISTER_OP_GPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 44d1e63f1b..0f8998b99e 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,11 +31,12 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. 
Y->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), X, Y); + math::SoftmaxFunctor()( + context.template device_context(), X, Y); } }; -template +template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,7 +47,8 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. dX->mutable_data(context.GetPlace()); - math::SoftmaxGradFunctor()(context.device_context(), Y, dY, dX); + math::SoftmaxGradFunctor()( + context.template device_context(), Y, dY, dX); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index b1faddac3f..6100c63f9a 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -69,10 +69,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, context.Attr("soft_label")); } }; @@ -98,18 +98,18 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { if (context.Attr("soft_label")) { const T* label_data = labels->data(); - SoftCrossEntropyGradientKernel<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } else { const int64_t* label_data = labels->data(); - CrossEntropyGrad<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } } }; @@ -118,9 +118,9 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index c4ab3f74b4..9c3431605b 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -40,11 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, - 
context.Attr("soft_label")); + auto& dev_ctx = + context.template device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); } }; @@ -62,14 +63,15 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; auto out_grad_mat = EigenMatrix::From(*out_grad); auto logit_grad_mat = EigenMatrix::From(*logit_grad); - + auto& place = *context.template device_context() + .eigen_device(); if (context.Attr("soft_label")) { auto lbl_mat = EigenMatrix::From(*labels); - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * (logit_grad_mat - lbl_mat); } else { - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = logit_grad_mat * out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); diff --git a/paddle/operators/split_op.cu.cc b/paddle/operators/split_op.cu.cc index 93d1fc3c44..dbad0bbf68 100644 --- a/paddle/operators/split_op.cu.cc +++ b/paddle/operators/split_op.cu.cc @@ -14,5 +14,5 @@ limitations under the License. */ #include "paddle/operators/split_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(split, - ops::SplitOpKernel); +REGISTER_OP_CUDA_KERNEL( + split, ops::SplitOpKernel); diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h index fa26e5f677..a38c435d53 100644 --- a/paddle/operators/split_op.h +++ b/paddle/operators/split_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class SplitOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index bec2a2c18a..50bc6da196 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -115,7 +115,7 @@ REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, ops::SquaredL2DistanceGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_CPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu index 3fe62f1a9c..ecc82ed1e4 100644 --- a/paddle/operators/squared_l2_distance_op.cu +++ b/paddle/operators/squared_l2_distance_op.cu @@ -17,9 +17,9 @@ #include "paddle/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_GPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h index 259ef40296..5bd5f4819a 100644 --- a/paddle/operators/squared_l2_distance_op.h +++ b/paddle/operators/squared_l2_distance_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SquaredL2DistanceKernel : public 
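// Annotation (not part of the patch): the kernel declared here computes, row by
// row, the squared Euclidean distance
//
//   out[i] = \sum_j (x[i][j] - y[i][j])^2
//
// and stores the elementwise difference (sub_result) so that the gradient kernel
// further down can reuse it.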
framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class SquaredL2DistanceKernel : public framework::OpKernel { auto sub_result = EigenMatrix::From(*out0); auto z = EigenVector::Flatten(*out1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x_dims = x.dimensions(); auto y_dims = y.dimensions(); // buffer the substraction result @@ -67,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel { } }; -template +template class SquaredL2DistanceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,7 +90,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { sub_result; // propagate back to input - auto eigen_place = context.GetEigenDevice(); + auto& eigen_place = + *context.template device_context().eigen_device(); if (x_g) { x_g->mutable_data(context.GetPlace()); // eigen matrix diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc index 3c10e6159f..3cff61a02f 100644 --- a/paddle/operators/squared_l2_norm_op.cc +++ b/paddle/operators/squared_l2_norm_op.cc @@ -72,7 +72,7 @@ REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); + ops::SquaredL2NormKernel); REGISTER_OP_CPU_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu index d384e9c28c..2d6567d090 100644 --- a/paddle/operators/squared_l2_norm_op.cu +++ b/paddle/operators/squared_l2_norm_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/squared_l2_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); -REGISTER_OP_GPU_KERNEL( + ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h index 48d7b1c2d5..0ced7e7d70 100644 --- a/paddle/operators/squared_l2_norm_op.h +++ b/paddle/operators/squared_l2_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(square(X)) -template +template class SquaredL2NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class SquaredL2NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); - out.device(place) = x.square().sum(); + out.device(*place) = x.square().sum(); } }; // dX = X -template +template class SquaredL2NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -53,10 +54,11 @@ class SquaredL2NormGradKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); Eigen::DSizes 
x_dsize(X->numel()); - dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); } }; diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 744b2fe3f2..cd52672f78 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -195,7 +195,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, ops::SumOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu index 5c30dd4d47..873155076c 100644 --- a/paddle/operators/sum_op.cu +++ b/paddle/operators/sum_op.cu @@ -13,7 +13,8 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index ed6c80ce60..eaa36aa1ae 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -43,12 +43,14 @@ class SumKernel : public framework::OpKernel { auto result = EigenVector::Flatten(*out); if (!in_place) { - math::SetConstant constant_functor; - constant_functor(context.device_context(), out, 0.0); + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); } - math::SelectedRowsAddToTensor functor; - auto place = context.GetEigenDevice(); + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); // If in_place, just skip the first tensor for (int i = in_place ? 
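// Annotation (not part of the patch): the math functors follow the same
// migration: they are parameterized on the device-context type and called with
// the typed context, for example
//
//   math::SetConstant<DeviceContext, T> set_zero;
//   set_zero(context.template device_context<DeviceContext>(), out,
//            static_cast<T>(0));
//
// and likewise for SelectedRowsAddToTensor / SelectedRowsAddTo in this hunk.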
1 : 0; i < N; i++) { if (in_vars[i]->IsType()) { @@ -60,7 +62,7 @@ class SumKernel : public framework::OpKernel { result.device(place) = result + in; } else if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); - functor(context.device_context(), in_t, out); + functor(context.template device_context(), in_t, out); } else { PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); } @@ -82,14 +84,14 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim_vec)); out_value->mutable_data(context.GetPlace()); - math::SelectedRowsAddTo functor; + math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { PADDLE_ENFORCE_EQ(out->height(), in_vars[i]->Get().height()); - functor(context.device_context(), in_vars[i]->Get(), - offset, out); + functor(context.template device_context(), + in_vars[i]->Get(), offset, out); offset += in_vars[i]->Get().value().numel(); } } else if (out_var->IsType()) { @@ -112,7 +114,8 @@ class SumKernel : public framework::OpKernel { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); - result.device(context.GetEigenDevice()) = result + in; + result.device(*context.template device_context() + .eigen_device()) = result + in; } } } diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 4eb8b60f47..2835b84f75 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -60,12 +60,16 @@ class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) the subscript index in tensor array. The number of element " "should be 1"); AddOutput("Out", "(TensorArray) the tensor array will be written"); - AddComment(R"DOC(Write a LoDTensor to a LoDTensor array. + AddComment(R"DOC( +WriteToArray Operator. -Assume T is LoDTensor, i is the subscript of the array, and A is the array. The +This operator writes a LoDTensor to a LoDTensor array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The equation is -A[i] = T +$$A[i] = T$$ + )DOC"); } }; @@ -144,12 +148,16 @@ class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) the subscript index in tensor array. The number of " "element should be 1"); AddOutput("Out", "(LoDTensor) the tensor will be read from."); - AddComment(R"DOC(Read a LoDTensor from a LoDTensor Array + AddComment(R"DOC( +ReadFromArray Operator. -Assume T is LoDTensor, i is th e subscript of the array, and A is the array. The +Read a LoDTensor from a LoDTensor Array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. 
The equation is -T = A[i] +$$T = A[i]$$ + )DOC"); } }; diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 7851c71bbe..453bd07267 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -317,4 +317,4 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index bc8563717a..e9cd9bbd4d 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class TopkKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index 94de3d5069..de5ff561ad 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -112,8 +112,8 @@ class TransposeOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, ops::TransposeOpGrad); -REGISTER_OP_CPU_KERNEL(transpose, - ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc index af3f581462..7d23f1493e 100644 --- a/paddle/operators/transpose_op.cu.cc +++ b/paddle/operators/transpose_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/transpose_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(transpose, - ops::TransposeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h index e296032f41..d995271a6b 100644 --- a/paddle/operators/transpose_op.h +++ b/paddle/operators/transpose_op.h @@ -20,33 +20,33 @@ namespace paddle { namespace operators { -template -inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { switch (dim) { case 1: - math::Transpose trans1; + math::Transpose trans1; trans1(dev_ctx, in, out, axis); break; case 2: - math::Transpose trans2; + math::Transpose trans2; trans2(dev_ctx, in, out, axis); break; case 3: - math::Transpose trans3; + math::Transpose trans3; trans3(dev_ctx, in, out, axis); break; case 4: - math::Transpose trans4; + math::Transpose trans4; trans4(dev_ctx, in, out, axis); break; case 5: - math::Transpose trans5; + math::Transpose trans5; trans5(dev_ctx, in, out, axis); break; case 6: - math::Transpose trans6; + math::Transpose trans6; trans6(dev_ctx, in, out, axis); break; default: @@ -54,7 +54,7 @@ inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, } } -template +template class TransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,12 +64,12 @@ class TransposeKernel : public framework::OpKernel { std::vector axis = 
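// Sketch added for this write-up (my_op and MyOpKernel are hypothetical names):
// kernel registration now names the device-context type, and the GPU macro is
// renamed to REGISTER_OP_CUDA_KERNEL, e.g.
//
//   REGISTER_OP_CPU_KERNEL(
//       my_op, ops::MyOpKernel<paddle::platform::CPUDeviceContext, float>);
//   REGISTER_OP_CUDA_KERNEL(
//       my_op, ops::MyOpKernel<paddle::platform::CUDADeviceContext, float>);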
context.Attr>("axis"); int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *x, out, axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); } }; -template +template class TransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -88,8 +88,9 @@ class TransposeGradKernel : public framework::OpKernel { } int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *out_grad, x_grad, reversed_axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, + reversed_axis); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index fff1dc7ccd..2a49ee471f 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu index 8b20bb8287..cfe9d293cf 100644 --- a/paddle/operators/uniform_random_op.cu +++ b/paddle/operators/uniform_random_op.cu @@ -63,6 +63,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 89c48e071c..49df2a530c 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -135,9 +135,10 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 18aafb7dc7..9b002e35c4 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,9 +15,10 @@ limitations under the License. */ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 243eb7e532..ee18b118c9 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -32,15 +32,16 @@ class UnpoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; - set_zero(context.device_context(), out, static_cast(0)); + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,15 +57,14 @@ class UnpoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, - *out_grad, in_x_grad); + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); } }; } // namespace operators diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 9b3f21cf94..b8e44bcc5a 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -187,7 +187,8 @@ class WhileGradOp : public framework::OperatorBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } } @@ -195,7 +196,7 @@ class WhileGradOp : public framework::OperatorBase { auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_ctx); cur_scope.Rename(new_inside_name, inside_grad_name); } diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h index f157188a4f..5b0c52a30d 100644 --- a/paddle/parameter/FirstOrderOptimizer.h +++ b/paddle/parameter/FirstOrderOptimizer.h @@ -38,7 +38,7 @@ public: real torch_learningRate = optConfig_.learning_method() == "torch_momentum" ? 1.0 - paraConfig.momentum() : 1.0; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN sgdUpdate(learningRate_ * paraConfig.learning_rate() * (firstTime_ ? 
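// Annotation (not part of the patch): while_op now passes explicitly typed empty
// maps to OpRegistry::CreateOp, e.g.
//
//   framework::OpRegistry::CreateOp(
//       "sum", {{"X", {pg_names[param_id], new_inside_name}}},
//       {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
//
// presumably so that a bare {} cannot bind to the wrong parameter; in the same
// sweep the preprocessor guards PADDLE_USE_MKLDNN / PADDLE_USE_MKLML are renamed
// to the PADDLE_WITH_* convention used elsewhere in the tree.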
1.0 : torch_learningRate), paraConfig.momentum(), diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 1898598e49..d60cb36383 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate, const real* grad, real* momentumVec) { decayRate *= learningRate; -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif for (size_t i = 0; i < size; ++i) { diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 7afcdfce93..2c7f964216 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,12 +15,6 @@ limitations under the License. */ namespace paddle { namespace platform { -template <> -Eigen::DefaultDevice* DeviceContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return reinterpret_cast(this)->eigen_device(); -} - CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -37,12 +31,6 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } #ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice* -DeviceContext::GetEigenDevice() const { - return reinterpret_cast(this)->eigen_device(); -} - class EigenCudaStreamDevice : public Eigen::StreamInterface { public: EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -122,10 +110,6 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); -} - -void CUDADeviceContext::Finish() const { - Wait(); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 526d089e35..596d9d0bba 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -27,27 +27,12 @@ limitations under the License. */ namespace paddle { namespace platform { -template -struct EigenDeviceConverter; - -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; - class DeviceContext { public: virtual ~DeviceContext() {} virtual Place GetPlace() const = 0; - template ::EigenDeviceType> - DeviceType* GetEigenDevice() const; - virtual void Wait() const {} - - virtual void Finish() const {} }; class CPUDeviceContext : public DeviceContext { @@ -64,10 +49,6 @@ class CPUDeviceContext : public DeviceContext { }; #ifdef PADDLE_WITH_CUDA -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; -}; class EigenCudaStreamDevice; @@ -79,9 +60,6 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - /*! \brief Check potential errors for the cuda kernel calls. */ - void Finish() const override; - /*! \brief Return place in the device context. 
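// Annotation (not part of the patch): DeviceContext::GetEigenDevice<Place>() and
// the separate Finish() hook are removed by this change. CUDADeviceContext::Wait()
// now both synchronizes the stream and checks cudaGetLastError(), and callers ask
// the concrete context for its Eigen device instead:
//
//   cpu_ctx.eigen_device();   // Eigen::DefaultDevice*
//   cuda_ctx.eigen_device();  // Eigen::GpuDevice*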
*/ Place GetPlace() const override; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 8bf5174c4a..4893cd92f6 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -22,9 +22,8 @@ TEST(Device, Init) { int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); - Eigen::GpuDevice* gpu_device = - device_context->template GetEigenDevice(); + CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index 761d9edd87..76ec82e108 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace platform { @@ -41,6 +42,21 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif +#ifdef PADDLE_USE_DSO +bool HasCUDNN() { + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE(cudnn_dso_handle != nullptr, + "Cannot load cudnn shared library. Cannot invoke method %s", + fn_name); +} +#else +bool HasCUDNN() { return true; } +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index 61caac5450..8c937b37d7 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -25,9 +25,11 @@ namespace dynload { extern std::once_flag cudnn_dso_flag; extern void* cudnn_dso_handle; +extern bool HasCUDNN(); #ifdef PADDLE_USE_DSO +extern void EnforceCUDNNLoaded(const char* fn_name); #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ @@ -36,6 +38,7 @@ extern void* cudnn_dso_handle; std::call_once(cudnn_dso_flag, \ paddle::platform::dynload::GetCudnnDsoHandle, \ &cudnn_dso_handle); \ + EnforceCUDNNLoaded(#__name); \ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index 6feba42c0d..7a82d06a0a 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -78,12 +78,11 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, *dso_handle = dlopen(dso_path.c_str(), dynload_flags); if (nullptr == *dso_handle) { if (dso_path == "libcudnn.dylib") { - PADDLE_ENFORCE(true, - "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"); + LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"; } } } @@ -92,7 +91,8 @@ 
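// Sketch added for this write-up: the new HasCUDNN() helper lets callers probe
// for the cuDNN shared library instead of failing inside the first wrapped call,
// roughly
//
//   if (paddle::platform::dynload::HasCUDNN()) {
//     // the cudnn* wrappers are safe to use
//   }
//
// while EnforceCUDNNLoaded() turns a missing library into an error that names the
// function being invoked, and GetDsoHandleFromSearchPath gains a throw_on_error
// switch so the cuDNN lookup can merely warn.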
static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, static inline void GetDsoHandleFromSearchPath(const std::string& search_root, const std::string& dso_name, - void** dso_handle) { + void** dso_handle, + bool throw_on_error = true) { int dynload_flags = RTLD_LAZY | RTLD_LOCAL; *dso_handle = nullptr; @@ -111,15 +111,19 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } - PADDLE_ENFORCE(nullptr != *dso_handle, - "Failed to find dynamic library: %s ( %s ) \n Please specify " - "its path correctly using following ways: \n Method. set " - "environment variable LD_LIBRARY_PATH on Linux or " - "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " - "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " - "using the DYLD_LIBRARY_PATH is impossible unless System " - "Integrity Protection (SIP) is disabled.", - dlPath, dlerror()); + auto error_msg = + "Failed to find dynamic library: %s ( %s ) \n Please specify " + "its path correctly using following ways: \n Method. set " + "environment variable LD_LIBRARY_PATH on Linux or " + "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " + "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " + "using the DYLD_LIBRARY_PATH is impossible unless System " + "Integrity Protection (SIP) is disabled."; + if (throw_on_error) { + PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); + } else if (nullptr == *dso_handle) { + LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + } } void GetCublasDsoHandle(void** dso_handle) { @@ -132,9 +136,10 @@ void GetCublasDsoHandle(void** dso_handle) { void GetCudnnDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, + false); #else - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); #endif } diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h index bb9d59ec0a..148ebaed3d 100644 --- a/paddle/platform/transform.h +++ b/paddle/platform/transform.h @@ -31,7 +31,7 @@ namespace paddle { namespace platform { // Transform on host or device. It provides the same API in std library. 
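// Usage sketch for this write-up (Scale is a caller-defined functor, mirroring
// the updated transform_test.cu further below): Transform is now specialized on
// the concrete device-context type rather than on a Place, e.g.
//
//   platform::CPUDeviceContext ctx;
//   float buf[4] = {0.1, 0.2, 0.3, 0.4};
//   platform::Transform<platform::CPUDeviceContext> trans;
//   trans(ctx, buf, buf + 4, buf, Scale(10));  // unary transform, in place
//
// and the CUDADeviceContext specialization runs thrust::transform on ctx.stream().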
-template +template struct Transform { template void operator()(const DeviceContext& context, InputIter first, InputIter last, @@ -45,16 +45,16 @@ struct Transform { }; template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CPUDeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { std::transform(first, last, result, op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CPUDeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { std::transform(first1, last1, first2, result, op); @@ -63,27 +63,25 @@ struct Transform { #ifdef __NVCC__ template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CUDADeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first), details::DevPtrCast(last), details::DevPtrCast(result), op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CUDADeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first1), details::DevPtrCast(last1), details::DevPtrCast(first2), details::DevPtrCast(result), op); diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index c76cab80e4..d36eac8379 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -39,7 +39,7 @@ TEST(Transform, CPUUnary) { using namespace paddle::platform; CPUDeviceContext ctx; float buf[4] = {0.1, 0.2, 0.3, 0.4}; - Transform trans; + Transform trans; trans(ctx, buf, buf + 4, buf, Scale(10)); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); @@ -54,7 +54,7 @@ TEST(Transform, GPUUnary) { float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); @@ -68,7 +68,7 @@ TEST(Transform, CPUBinary) { using namespace paddle::platform; using namespace paddle::memory; int buf[4] = {1, 2, 3, 4}; - Transform trans; + Transform trans; CPUDeviceContext ctx; trans(ctx, buf, buf + 4, buf, buf, Multiply()); for (int i = 0; i < 4; ++i) { @@ -84,7 +84,7 @@ TEST(Transform, GPUBinary) { CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, 
gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh new file mode 100755 index 0000000000..af16b84ca8 --- /dev/null +++ b/paddle/scripts/check_env.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +if [ "`uname -s`" != "Linux" ]; then + echo "Current scenario only support in Linux yet!" + exit 0 +fi + +echo "========================= Hardware Information =========================" +sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l` +cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l` +ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs` +physical_cores=$((sockets * cores_per_socket)) +virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l` +numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs` +echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`" +echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`" +echo "Socket Number : $sockets" +echo "Cores Per Socket : $cores_per_socket" +echo "Total Physical Cores : $physical_cores" +echo "Total Virtual Cores : $virtual_cores" +if [ $ht -eq 1 ]; then + echo "Hyper Threading : OFF" + if [ $physical_cores -ne $virtual_cores ]; then + echo "Error: HT logical error" + fi +else + echo "Hyper Threading : ON" + if [ $physical_cores -ge $virtual_cores ]; then + echo "Error: HT logical error" + fi +fi +echo "NUMA Nodes : $numa_nodes" +if [ $numa_nodes -lt $sockets ]; then + echo "Warning: NUMA node is not enough for the best performance,\ + at least $sockets" +fi + +echo "-------------------------- Memory Information --------------------------" +# dmidecode support start from 2.11 +dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs` +if [ $dmi_ver -lt 2 ]; then + echo "Error: dmidecode unknown or version is too old" + exit 0 +fi +if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then + echo "Error: need root to run dmidecode" + exit 0 +fi +max_dimms=0 +num_dimms_installed=0 +for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do + num_refered=`dmidecode |grep -wc "$dimm_id"` + # the actual dimm id should be refered only once + if [ $num_refered -eq 1 ]; then + num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0}; + /Unknown/ {f=1}; + /Manufacturer/ {if (s==1) {print f; exit 0;}};'` + if [ $num_unknown -eq 0 ]; then + dimms_installed="$dimms_installed \n $dimm_id" + ((num_dimms_installed++)) + else + dimms_uninstalled="$dimms_uninstalled \n $dimm_id" + fi + ((max_dimms++)) + fi +done +echo "Installed DIMM number : $num_dimms_installed" +num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l` +if [ $num_dimms_installed -ne $num_dimms_mapped ]; then + echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped" +fi +num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"` +if [ $num_dimms_installed -ne $num_clock_configed ]; then + echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed" +fi +echo -e "Installed DIMMs Locator: $dimms_installed" +echo -e "Not installed DIMMs : $dimms_uninstalled" +max_dimm_slots=`dmidecode | grep -c "Bank Locator"` +echo "DIMMs max slots : $max_dimm_slots" +if [ $max_dimms -ne $max_dimm_slots ]; then + echo "Error: The max dimm slots do not match the max dimms: $max_dimms" +fi +free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' 
'{print $1}'` +free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'` +if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then + mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs` + swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs` + total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs` + mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB" + swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB" + total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB" +else + mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs` + swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs` + total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs` +fi +echo "Memory Size : $mem_sz" +echo "Swap Memory Size : $swap_sz" +echo "Total Memory Size : $total_sz" +echo "Max Memory Capacity : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`" +# DIMMs fequency +clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs` +echo "Configed Clock Speed : $clock_speeds" +num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l` +if [ $num_clock_type -ne 1 ]; then + echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds" +fi + +echo "-------------------------- Turbo Information --------------------------" +scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver` +echo "Scaling Driver : $scaling_drive" +if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then + turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo` + if [ $turbo -eq 1 ]; then + echo "Turbo Status : OFF" + else + echo "Turbo Status : ON" + fi +else + echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS" + echo "Turbo Status : Unknown" +fi +# cpu frequency +num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l` +num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l` +if [ $num_max_freq -ne 1 ]; then + echo "Error: the max_frequency of all CPU should be equal" +fi +if [ $num_min_freq -ne 1 ]; then + echo "Error: the min_frequency of all CPU should be equal" +fi +max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz +max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz +min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz +min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz +echo "CPU Max Frequency : $max_freq GHz" +echo "CPU Min Frequency : $min_freq GHz" +# cpu governor +num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l` +if [ $num_governor -ne 1 ]; then + echo "Error: the governor of all CPU should be the same" +fi +governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq` +echo "CPU Freq Governor : $governor" + + +echo "========================= Software Information =========================" +echo "BIOS Release Date : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`" +echo "OS Version : `cat /etc/redhat-release`" +echo "Kernel Release Version : `uname -r`" +echo "Kernel Patch Version : `uname -v`" +echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" +if command -v cmake >/dev/null 2>&1; then + cmake_ver=`cmake 
--version | head -n 1 | awk -F 'version' '{print $2}'` +else + cmake_ver=" Not installed" +fi +echo "CMake Version :$cmake_ver" +echo "------------------ Environment Variables Information -------------------" +kmp_affinity=`env | grep KMP_AFFINITY` +omp_dynamic=`env | grep OMP_DYNAMIC` +omp_nested=`env | grep OMP_NESTED` +omp_num_threads=`env | grep OMP_NUM_THREADS` +mkl_num_threads=`env | grep MKL_NUM_THREADS` +mkl_dynamic=`env | grep MKL_DYNAMIC` +if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi +if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi +if [ ! $omp_nested ]; then omp_nested="unset"; fi +if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi +if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi +if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi +echo "KMP_AFFINITY : $kmp_affinity" +echo "OMP_DYNAMIC : $omp_dynamic" +echo "OMP_NESTED : $omp_nested" +echo "OMP_NUM_THREADS : $omp_num_threads" +echo "MKL_NUM_THREADS : $mkl_num_threads" +echo "MKL_DYNAMIC : $mkl_dynamic" +# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH +for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do + mkldnn_found=`find $path -name "libmkldnn.so"` + if [ "$mkldnn_found" ]; then + echo "Found MKL-DNN : $mkldnn_found" + fi + mklml_found=`find $path -name "libmklml_intel.so"` + if [ "$mklml_found" ]; then + echo "Found MKLML : $mklml_found" + fi + iomp_found=`find $path -name "libiomp5.so"` + if [ "$iomp_found" ]; then + echo "Found IOMP : $iomp_found" + fi +done + +# dump all details for fully check +lscpu > lscpu.dump +dmidecode > dmidecode.dump + +# The expected result would be like: +# ========================= Hardware Information ========================= +# CPU Name : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz +# CPU Family : 6 +# Socket Number : 2 +# Cores Per Socket : 20 +# Total Physical Cores : 40 +# Total Virtual Cores : 40 +# Hyper Threading : OFF +# NUMA Nodes : 2 +# -------------------------- Memory Information -------------------------- +# Installed DIMM number : 12 +# Installed DIMMs Locator: +# CPU1_DIMM_A1 +# CPU1_DIMM_B1 +# CPU1_DIMM_C1 +# CPU1_DIMM_D1 +# CPU1_DIMM_E1 +# CPU1_DIMM_F1 +# CPU2_DIMM_A1 +# CPU2_DIMM_B1 +# CPU2_DIMM_C1 +# CPU2_DIMM_D1 +# CPU2_DIMM_E1 +# CPU2_DIMM_F1 +# Not installed DIMMs : +# CPU1_DIMM_A2 +# CPU1_DIMM_B2 +# CPU1_DIMM_C2 +# CPU1_DIMM_D2 +# CPU1_DIMM_E2 +# CPU1_DIMM_F2 +# CPU2_DIMM_A2 +# CPU2_DIMM_B2 +# CPU2_DIMM_C2 +# CPU2_DIMM_D2 +# CPU2_DIMM_E2 +# CPU2_DIMM_F2 +# DIMMs max slots : 24 +# Memory Size : 376G +# Swap Memory Size : 4.0G +# Total Memory Size : 380G +# Max Memory Capacity : 2304 GB +# Configed Clock Speed : 2666 MHz +# -------------------------- Turbo Information -------------------------- +# Scaling Driver : intel_pstate +# Turbo Status : ON +# CPU Max Frequency : 3.70 GHz +# CPU Min Frequency : 1.00 GHz +# CPU Freq Governor : performance +# ========================= Software Information ========================= +# BIOS Release Date : 03/10/2017 +# OS Version : CentOS Linux release 7.3.1611 (Core) +# Kernel Release Version : 3.10.0-514.el7.x86_64 +# Kernel Patch Version : #1 SMP Tue Nov 22 16:42:41 UTC 2016 +# GCC Version : 4.8.5 20150623 (Red Hat 4.8.5-11) +# CMake Version : 3.5.2 +# ------------------ Environment Variables Information ------------------- +# KMP_AFFINITY : unset +# OMP_DYNAMIC : unset +# OMP_NESTED : unset +# OMP_NUM_THREADS : unset +# MKL_NUM_THREADS : unset +# MKL_DYNAMIC : unset diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh 
index 0f889e6853..e43b9c218a 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -113,7 +113,10 @@ EOF -DWITH_SWIG_PY=ON \ -DWITH_STYLE_CHECK=OFF make -j `nproc` gen_proto_py + make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn + make -j `nproc` print_operators_doc + paddle/pybind/print_operators_doc > doc/en/html/operators.json popd fi @@ -175,7 +178,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip && pip install -U pip && \ + apt-get install -y wget python-pip dmidecode && pip install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ @@ -185,14 +188,6 @@ EOF ${DOCKERFILE_GPU_ENV} ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/master/master /usr/bin/ -EOF - - if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then - cat >> /paddle/build/Dockerfile <> /paddle/build/Dockerfile < doc/en/html/operators.json # check websites for broken links # It will be failed now! diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index 8f100f02e9..9a7dc0e356 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -20,7 +20,7 @@ DEFINE_bool(use_gpu, false, "Only support CPU training"); DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // TODO(TJ): change to true when MKLDNN layers support multi-inputs DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training"); #else diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 2fcdbbc8bd..1fbdd5bbd8 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -139,6 +139,8 @@ message PoolConfig { optional uint32 output_z = 16 [ default = 1 ]; optional uint32 img_size_z = 17 [ default = 1 ]; optional uint32 padding_z = 18 [ default = 1 ]; + + optional bool exclude_mode = 19; } message SppConfig { diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c8632295a2..6f589e9169 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -33,6 +33,12 @@ if(WITH_MKLDNN) list(APPEND MKL_DEPENDS mkldnn) endif() +if(WITH_GPU) + SET(PACKAGE_NAME "paddlepaddle-gpu") +else() + SET(PACKAGE_NAME "paddlepaddle") +endif() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f662d68263..1030c94e16 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -11,3 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
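# Usage sketch added for this write-up (not part of the patch): once PaddlePaddle
# is installed from a wheel, the generated version module imported below lets
# users introspect their install, e.g.
#
#   import paddle
#   print(paddle.__version__)      # full_version recorded at build time
#   print(paddle.__git_commit__)   # git commit of the source tree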
+try: + from version import full_version as __version__ + from version import commit as __git_commit__ +except ImportError: + import sys + sys.stderr.write('''Warning with import paddle: you should not + import paddle from the source directory; please install paddlepaddle*.whl firstly.''' + ) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5b173694dd..239fe4204b 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1233,7 +1233,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf): bilinear_conf.out_size_y = bilinear.out_size_y -def parse_pool(pool, input_layer_name, pool_conf, ceil_mode): +def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode): pool_conf.pool_type = pool.pool_type config_assert(pool.pool_type in [ 'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool' @@ -1262,6 +1262,8 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode): pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y, pool_conf.padding_y, pool_conf.stride_y, not ceil_mode) + if exclude_mode != None: + pool_conf.exclude_mode = exclude_mode def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode): @@ -2287,11 +2289,17 @@ class Conv3DLayer(Conv3DLayerBase): class NormLayer(LayerBase): def __init__(self, name, inputs, **xargs): super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs) + use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) + use_mkldnn = True if use_mkldnn and self.inputs[ + 0].norm.norm_type == 'cmrnorm-projection' else False + self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) norm_conf = self.config.inputs[input_index].norm_conf parse_norm(self.inputs[input_index].norm, input_layer.name, norm_conf) + norm_conf.scale = self.inputs[ + input_index].norm.scale if use_mkldnn else norm_conf.scale self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x, norm_conf.channels, False) if norm_conf.norm_type == "cross-channel-norm": @@ -2303,7 +2311,8 @@ class NormLayer(LayerBase): class PoolLayer(LayerBase): layer_type = 'pool' - def __init__(self, name, inputs, ceil_mode=True, **xargs): + def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None, + **xargs): use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0)) if self.layer_type == "mkldnn_pool": config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN") @@ -2314,7 +2323,7 @@ class PoolLayer(LayerBase): input_layer = self.get_input_layer(input_index) pool_conf = self.config.inputs[input_index].pool_conf parse_pool(self.inputs[input_index].pool, input_layer.name, - pool_conf, ceil_mode) + pool_conf, ceil_mode, exclude_mode) self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x, pool_conf.channels) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index f6dc58b9c0..7e118b24a4 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -21,7 +21,7 @@ from .activations import LinearActivation, SigmoidActivation, TanhActivation, \ ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation from .evaluators import * from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \ - CudnnAvgPooling, CudnnMaxPooling + CudnnAvgPooling, 
CudnnAvgInclPadPooling, CudnnMaxPooling from .attrs import * from .default_decorators import * @@ -1519,34 +1519,33 @@ def lstmemory(input, NOTE: This is a low level user interface. You can use network.simple_lstm to config a simple plain lstm layer. - Please refer to **Generating Sequences With Recurrent Neural Networks** for - more details about LSTM. - - Link_ goes as below. - - .. _Link: http://arxiv.org/abs/1308.0850 + Reference: + `Generating Sequences With Recurrent Neural Networks + `_ - :param name: The lstmemory layer name. + :param name: The name of this layer. It is optional. :type name: basestring - :param size: DEPRECATED. size of the lstm cell + :param size: DEPRECATED. The dimension of the lstm cell. :type size: int :param input: The input of this layer. :type input: LayerOutput - :param reverse: is sequence process reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. :type reverse: bool :param act: Activation type. TanhActivation is the default activation. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. + :param gate_act: Activation type of this layer's gates. SigmoidActivation is the + default activation. :type gate_act: BaseActivation - :param state_act: state activation type, TanhActivation by default. + :param state_act: Activation type of the state. TanhActivation is the default activation. :type state_act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1635,14 +1634,14 @@ def grumemory(input, h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}} NOTE: In PaddlePaddle's implementation, the multiplication operations - :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in - gate_recurrent layer. Consequently, an additional mixed_layer with + :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed + in gate_recurrent layer. Consequently, an additional mixed_layer with full_matrix_projection or a fc_layer must be included before grumemory is called. - More details can be found by referring to `Empirical Evaluation of Gated - Recurrent Neural Networks on Sequence Modeling. - `_ + Reference: + `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling + `_ The simple usage is: @@ -1650,28 +1649,29 @@ def grumemory(input, gru = grumemory(input) - :param name: The gru layer name. - :type name: None | basestring + :param name: The name of this layer. It is optional. + :type name: basestring :param input: The input of this layer. :type input: LayerOutput. - :param size: DEPRECATED. size of the gru cell + :param size: DEPRECATED. The dimension of the gru cell. :type size: int - :param reverse: Whether sequence process is reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. 
:type reverse: bool :param act: Activation type, TanhActivation is the default. This activation affects the :math:`{\\tilde{h_t}}`. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. - This activation affects the :math:`z_t` and :math:`r_t`. It is the - :math:`\\sigma` in the above formula. + :param gate_act: Activation type of this layer's two gates. SigmoidActivation is + the default activation. This activation affects the :math:`z_t` + and :math:`r_t`. It is the :math:`\\sigma` in the above formula. :type gate_act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1715,10 +1715,10 @@ def last_seq(input, """ Get Last Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the last value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the last value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1727,14 +1727,16 @@ def last_seq(input, seq = last_seq(input=layer) :param agg_level: Aggregated level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. - :type layer_attr: ExtraLayerAttribute. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput """ @@ -1771,10 +1773,10 @@ def first_seq(input, """ Get First Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the first value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the first value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1783,13 +1785,15 @@ def first_seq(input, seq = first_seq(input=layer) :param agg_level: aggregation level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. 
:type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -1847,8 +1851,8 @@ def expand_layer(input, expand_level=ExpandLevel.FROM_NO_SEQUENCE, layer_attr=None): """ - A layer for "Expand Dense data or (sequence data where the length of each - sequence is one) to sequence data." + A layer for expanding dense data or (sequence data where the length of each + sequence is one) to sequence data. The example usage is: @@ -1860,7 +1864,9 @@ def expand_layer(input, :param input: The input of this layer. :type input: LayerOutput - :param expand_as: Expand as this layer's sequence info. + :param expand_as: Expand the input according to this layer's sequence infomation. And + after the operation, the input expanded will have the same number of + elememts as this layer. :type expand_as: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring @@ -1868,9 +1874,10 @@ def expand_layer(input, whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param expand_level: whether input layer is timestep(default) or sequence. + :param expand_level: Whether the input layer is a sequence or the element of a sequence. :type expand_level: ExpandLevel - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2709,7 +2716,8 @@ def img_pool_layer(input, pool_size_y=None, stride_y=None, padding_y=None, - ceil_mode=True): + ceil_mode=True, + exclude_mode=None): """ Image pooling Layer. @@ -2721,15 +2729,17 @@ def img_pool_layer(input, .. math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} - ceil_mode=False: .. math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} The example usage is: @@ -2773,10 +2783,15 @@ def img_pool_layer(input, :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details. :type layer_attr: ExtraLayerAttribute - :param ceil_mode: Wether to use the ceil function to calculate output height and width. + :param ceil_mode: Whether to use the ceil function to calculate output height and width. True is the default. If it is set to False, the floor function will be used. :type ceil_mode: bool + :param exclude_mode: Whether to exclude the padding cells when calculating, but only + work when pool_type is AvgPooling. If None, also exclude the padding + cells. If use cudnn, use CudnnAvgPooling or CudnnAvgInclPadPooling + as pool_type to identify the mode. + :type exclude_mode: bool :return: LayerOutput object. 
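A quick numeric check of the pooling output-size formulas above, assuming (as in the pre-existing implementation) that the rounding is applied after the division; the concrete numbers are illustrative only:

.. code-block:: python

    import math

    input_width, padding, pool_size, stride = 28, 1, 3, 2
    w_ceil = 1 + int(math.ceil((input_width + 2 * padding - pool_size) / float(stride)))   # ceil_mode=True  -> 15
    w_floor = 1 + int(math.floor((input_width + 2 * padding - pool_size) / float(stride)))  # ceil_mode=False -> 14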
:rtype: LayerOutput """ @@ -2790,7 +2805,7 @@ def img_pool_layer(input, pool_type.name = 'avg' assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling, - CudnnMaxPooling], \ + CudnnMaxPooling, CudnnAvgInclPadPooling], \ "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported" type_name = pool_type.name + '-projection' \ @@ -2819,6 +2834,7 @@ def img_pool_layer(input, padding_y=padding_y)) ], ceil_mode=ceil_mode, + exclude_mode=exclude_mode, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( name, @@ -2856,17 +2872,21 @@ def img_pool3d_layer(input, .. math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) - d = 1 + int(ceil(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} - ceil_mode=False: .. math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) - d = 1 + int(floor(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} The example usage is: @@ -2989,7 +3009,7 @@ def spp_layer(input, Reference: `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition - https://arxiv.org/abs/1406.4729`_ + `_ The example usage is: @@ -3091,7 +3111,7 @@ def img_cmrnorm_layer(input, Reference: `ImageNet Classification with Deep Convolutional Neural Networks - http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_ + `_ The example usage is: @@ -3159,7 +3179,7 @@ def batch_norm_layer(input, Reference: `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift - http://arxiv.org/abs/1502.03167`_ + `_ The example usage is: @@ -3297,7 +3317,7 @@ def row_l2_norm_layer(input, name=None, layer_attr=None): A layer for L2-normalization in each row. .. math:: - out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}} where the size of :math:`in` is (batchSize x dataDim) , and the size of :math:`out` is a (batchSize x dataDim) . @@ -5417,17 +5437,27 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): Reference: `Maxout Networks - http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_ + `_ `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks - https://arxiv.org/pdf/1312.6082v4.pdf`_ + `_ + .. 
math:: - y_{si+j} = \max_k x_{gsi + sk + j} - g = groups - s = input.size / num_channels - 0 \le i < num_channels / groups - 0 \le j < s - 0 \le k < groups + + & out = \max_k (in[n, k, o_c , s]) + + & out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j} + + & s = \\frac{input.size}{ num\_channels} + + & o_{c} = \\frac{num\_channels}{groups} + + & 0 \le i < o_{c} + + & 0 \le j < s + + & 0 \le k < groups + The simple usage is: @@ -5486,7 +5516,7 @@ def ctc_layer(input, Reference: `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_ + `_ Note: Considering the 'blank' label needed by CTC, you need to use (num_classes + 1) @@ -5560,7 +5590,7 @@ def warp_ctc_layer(input, Reference: `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_ + `_ Note: - Let num_classes represents the category number. Considering the 'blank' @@ -5781,7 +5811,7 @@ def nce_layer(input, Reference: `A fast and simple algorithm for training neural probabilistic language - models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_ + models. `_ The example usage is: @@ -5897,7 +5927,7 @@ def rank_cost(left, Reference: `Learning to Rank using Gradient Descent - http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_ + `_ .. math:: @@ -6164,9 +6194,11 @@ def huber_regression_cost(input, Given a prediction f(x), a label y and :math:`\delta`, the loss function is defined as: - .. math: - loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta - loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise + .. math:: + + loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta + + loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise The example usage is: @@ -6213,12 +6245,14 @@ def huber_classification_cost(input, """ For classification purposes, a variant of the Huber loss called modified Huber is sometimes used. Given a prediction f(x) (a real-valued classifier score) and - a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber + a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber loss is defined as: .. math: - loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1 - loss = -4yf(x), \text{otherwise} + + loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1 + + loss = -4yf(x), otherwise The example usage is: @@ -6433,7 +6467,7 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): Reference: `Fast R-CNN - https://arxiv.org/pdf/1504.08083v2.pdf`_ + `_ The example usage is: @@ -6640,7 +6674,7 @@ def prelu_layer(input, Reference: `Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_ + ImageNet Classification `_ .. math:: z_i &\\quad if \\quad z_i > 0 \\\\ @@ -6737,7 +6771,7 @@ def gated_unit_layer(input, Reference: `Language Modeling with Gated Convolutional Networks - https://arxiv.org/abs/1612.08083`_ + `_ .. math:: y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) @@ -6963,7 +6997,7 @@ def clip_layer(input, min, max, name=None): .. math:: - out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + out[i] = \min (\max (in[i],p_{1} ),p_{2} ) .. 
code-block:: python diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py index f45616551b..e0aeb311b3 100644 --- a/python/paddle/trainer_config_helpers/poolings.py +++ b/python/paddle/trainer_config_helpers/poolings.py @@ -16,7 +16,8 @@ __all__ = [ "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling", - "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling" + "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling", + "SumPooling", "SquareRootNPooling" ] @@ -88,6 +89,16 @@ class CudnnAvgPooling(BasePoolingType): BasePoolingType.__init__(self, "cudnn-avg-pool") +class CudnnAvgInclPadPooling(BasePoolingType): + """ + Cudnn average pooling only support GPU. Return the average value in the + pooling window taking into account the padding cells. + """ + + def __init__(self): + BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool") + + class AvgPooling(BasePoolingType): """ Average pooling. diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index e31e501ce9..191d9ecfb1 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -71,7 +71,7 @@ def download(url, module_name, md5sum): if retry < retry_limit: retry += 1 else: - raise RuntimeError("Cannot download {0} within retry limit {2}". + raise RuntimeError("Cannot download {0} within retry limit {1}". format(url, retry_limit)) print "Cache file %s not found, downloading %s" % (filename, url) r = requests.get(url, stream=True) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 49c6d89834..bf0cd275b6 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -3,6 +3,7 @@ import collections import numpy as np from . import core import proto.framework_pb2 as framework_pb2 +import google.protobuf.message import contextlib __all__ = [ @@ -13,11 +14,28 @@ __all__ = [ def unique_name(prefix): + """ + Generate unique names with prefix + + Args: + prefix(str): The prefix of return string + + Returns(str): A unique string with the prefix + + """ uid = core.unique_integer(prefix) # unique during whole process. return "_".join([prefix, str(uid)]) def convert_np_dtype_to_dtype_(np_dtype): + """ + Convert the data type in numpy to the data type in Paddle + Args: + np_dtype(np.dtype): the data type in numpy + + Returns(core.DataType): the data type in Paddle + + """ dtype = np.dtype(np_dtype) if dtype == np.float32: return core.DataType.FP32 @@ -38,17 +56,33 @@ def convert_np_dtype_to_dtype_(np_dtype): def dtype_is_floating(dtype): + """ + Check the data type is floating or not. + Args: + dtype(np.dtype|core.DataType): data type. + Could be numpy format or Paddle format + + Returns(bool): True if data type is a float value + + """ if not isinstance(dtype, core.DataType): dtype = convert_np_dtype_to_dtype_(dtype) - if (dtype == core.DataType.FP16 or dtype == core.DataType.FP32 or - dtype == core.DataType.FP64): - return True - else: - return False + return dtype in [core.DataType.FP16, core.DataType.FP32, core.DataType.FP64] def _debug_string_(proto, throw_on_error=True): + """ + Get the debug string of a protobuf message. The message could be not + initialized. + Args: + proto(google.protobuf.message.Message): The protobuf message + throw_on_error(bool): True if raise an error when the protobuf message + is not initialized. 
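For context on the two dtype helpers documented above, a minimal usage sketch (illustrative; the import paths are assumed from this module):

.. code-block:: python

    import numpy as np
    from paddle.v2.fluid import core
    from paddle.v2.fluid.framework import convert_np_dtype_to_dtype_, dtype_is_floating

    assert convert_np_dtype_to_dtype_(np.float32) == core.DataType.FP32
    # accepts either numpy dtypes or Paddle DataType values
    assert dtype_is_floating(np.float64) and dtype_is_floating(core.DataType.FP64)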
+ + Returns(str): The debug string of the protobuf message + + """ error_fields = list() if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError("{0} are not initialized\nThe message is {1}".format( @@ -57,6 +91,38 @@ def _debug_string_(proto, throw_on_error=True): class Variable(object): + """ + Python variable. Every input and output of an operator is a variable. Every + variable belongs to a block. The variable has a name and two variables in + different blocks could have the same name. + + There are many kinds of variables. Please reference the framework.proto for + details. + + Notes: The constructor of Variable should not be invoked directly. Please + use `Block.create_var` to create a variable. + + >>> cur_program = Program() + >>> cur_block = cur_program.current_block() + >>> new_variable = cur_block.create_var( + >>> name="X", shape=[-1, 23, 48], dtype='float32') + + Args: + block(Block): The associated block. It will be passed by + `Block.create_var` automatically. + type(core.VarDesc.VarType): Variable type. Please reference the + framework.proto for details. + shape(tuple|list|None): The shape of variable. -1 means the batch size. + Some kinds of variable do not contain shape, just set it to None. + dtype(np.dtype|core.DataType|str): The data type of variable. + lod_level(int): The level of lod tensor. 0 means there is not a time + series data. + persistable(bool): True if the variable should be saved as check point. + Defaults to False. + stop_gradient(bool): True if the variable will stop to calculate + gradients when backward. Defaults to False. + """ + def __init__(self, block, type=core.VarDesc.VarType.LOD_TENSOR, @@ -140,6 +206,16 @@ class Variable(object): return self.to_string(True) def to_string(self, throw_on_error): + """ + Get debug string. + + Args: + throw_on_error(bool): True if raise an exception when self is not + intialized. + + Returns(str): The debug string. + + """ protostr = self.desc.serialize_to_string() proto = framework_pb2.VarDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) @@ -185,7 +261,9 @@ class Variable(object): def get_all_op_protos(): """ Get all registered op proto from PaddlePaddle C++ end. - :return: A list of registered OpProto. + + Returns(list): list of OpProto + """ protostrs = core.get_all_op_protos() ret_values = [] @@ -196,6 +274,10 @@ def get_all_op_protos(): class OpProtoHolder(object): + """ + A global variable to hold all OpProtos from C++ as a map + """ + @classmethod def instance(cls): if not hasattr(cls, '_instance'): @@ -212,12 +294,26 @@ class OpProtoHolder(object): self.op_proto_map[proto.type] = proto def get_op_proto(self, type): + """ + Get OpProto by a type string. + Args: + type(str): The type that operator registered in C++ side. + + Returns(framework_pb2.OpProto): The OpProto + + """ if type not in self.op_proto_map: raise ValueError("Operator \"%s\" has not been registered." % type) return self.op_proto_map[type] class Operator(object): + """ + Python Operator class. The operator represents the build in instructs in a + Block. Users can use the build in instructs to describe their neural + network. + """ + def __init__(self, block, desc, @@ -225,6 +321,30 @@ class Operator(object): inputs=None, outputs=None, attrs=None): + """ + Constructor. + + Notes: The constructor of operator should not be invoked directly. Use + Block.append_op or Block.prepend_op instead. 
+ + >>> cur_program = Program() + >>> cur_block = cur_program.current_block() + >>> # var1 += var2 + var3 + >>> cur_block.append_op(type="sum", + >>> inputs={"X": [var1, var2, var3]}, + >>> outputs={"Out": [var1]}) + + Args: + block(Block): The block has the current operator + desc(core.OpDesc): The protobuf description + type(str): The type of operator. + inputs(dict): The input dictionary. Key is the input parameter name. + Value is a list of variables. + outputs(dict): The output dictionary. Has same format with inputs + attrs(dict): The attributes dictionary. Key is attribute name. Value + is the attribute value. The attribute type should be as same as + the type registered in C++ + """ self.block = block self.desc = desc if len(self.desc.type()) != 0: @@ -237,7 +357,7 @@ class Operator(object): def find_name(var_list, name): for var_name in var_list: - if var_name == name: + if var_list[var_name] is not None and var_name == name: return True return False @@ -311,6 +431,15 @@ class Operator(object): self.desc.infer_shape(self.block.desc) def to_string(self, throw_on_error): + """ + To debug string. + Args: + throw_on_error(bool): raise exception when self is not initialized + when throw_on_error is True + + Returns(str): The debug string. + + """ protostr = self.desc.serialize_to_string() proto = framework_pb2.OpDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) @@ -325,21 +454,55 @@ class Operator(object): return self.desc.type() def input(self, name): + """ + Get input arguments by the input parameter name + Args: + name(str): The input parameter name + + Returns(list): return the list of argument names associated with the + specific parameter name. + + """ return self.desc.input(name) @property def input_names(self): + """ + Get all input parameter names + Returns(list): return a list of input parameter names + + """ return self.desc.input_names() def output(self, name): + """ + Get output arguments by the output parameter name + Args: + name(str): The output parameter name + + Returns(list): return the list of argument names associated with the + specific parameter name. + + """ return self.desc.output(name) @property def output_names(self): + """ + Get all output parameter names + Returns(list): return a list of output parameter names + + """ return self.desc.output_names() @property def idx(self): + """ + Return the array index of current operator. + Returns(int): The array index in block.ops array + Raises: + ValueError: when the operator is not found. + """ for i, op in enumerate(self.block.ops): if op == self: return i @@ -347,19 +510,57 @@ class Operator(object): "Can't find op itself in it's block. It could be a bug of Paddle.") def has_attr(self, name): + """ + operator has the attribute with name or not. + Args: + name(str): the attribute name + + Returns(bool): True if has this attribute. + + """ return self.desc.has_attr(name) def attr_type(self, name): + """ + Get the type of attribute by attribute name + Args: + name(str): the attribute name + + Returns(core.AttrType): the attribute type + + """ return self.desc.attr_type(name) @property def attr_names(self): + """ + Get all attribute names + Returns(list): The list of attribute name + + """ return self.desc.attr_names() def attr(self, name): + """ + Get attribute by name + Args: + name(str): the attribute name + + Returns(bool|int|str|float|list): The attribute value. The return value + can be any valid attribute type. 
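The accessor methods added above make it easy to inspect an operator after it has been appended to a block. A small sketch, assuming the usual `fluid` import alias used by the tests in this patch:

.. code-block:: python

    import paddle.v2.fluid as fluid

    prog = fluid.Program()
    with fluid.program_guard(prog, fluid.Program()):
        x = fluid.layers.data(name='x', shape=[8], dtype='float32')
        y = fluid.layers.fc(input=x, size=4)

    op = prog.current_block().ops[-1]          # the last operator appended by fc
    print(op.type, op.input_names, op.output_names)
    for attr_name in op.attr_names:
        print(attr_name, op.attr_type(attr_name), op.attr(attr_name))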
+ + """ return self.desc.attr(name) def block_attr(self, name): + """ + Get the block attribute by name + Args: + name(str): the attribute name + + Returns(int): the block index + + """ return self.desc.block_attr(name) @@ -479,7 +680,7 @@ class Block(object): """ Copy the information of parameters from other block Args: - other(Block): other block + other(Block): other block Returns: None @@ -512,6 +713,7 @@ class Program(object): self.desc = core.ProgramDesc() self.blocks = [Block(self, 0)] self.current_block_idx = 0 + self._seed = 0 def __str__(self): return self.to_string(True) @@ -564,6 +766,16 @@ class Program(object): p.sync_with_cpp() return p + @property + def random_seed(self): + return self._seed + + @random_seed.setter + def random_seed(self, seed): + if not isinstance(seed, int): + raise ValueError("Seed must be a integer.") + self._seed = seed + def __repr__(self): return str(self) @@ -612,7 +824,7 @@ class Program(object): def copy_param_info_from(self, other): """ - Copy the information of parameters from other program. + Copy the information of parameters from other program. Args: other(Program): Other program @@ -664,7 +876,7 @@ def default_startup_program(): """ Get default startup program. In startup program, Paddle will initialize parameters, initialize nccl handle, etc. - + Returns: Program: startup program """ @@ -674,7 +886,7 @@ def default_startup_program(): def default_main_program(): """ Get default main program. The main program is used for training or testing. - + Returns: Program: main program """ @@ -684,7 +896,7 @@ def default_main_program(): def switch_main_program(program): """ Switch the main program to a new program. - + Args: program(Program): The new main program @@ -699,7 +911,7 @@ def switch_main_program(program): def switch_startup_program(program): """ - Switch the startup program to a new program + Switch the startup program to a new program Args: program(Program): The new startup program @@ -716,15 +928,15 @@ def switch_startup_program(program): def program_guard(main_program, startup_program=None): """ Switch program with `with` statement - + Examples: >>> with program_guard(Program()): >>> data = fluid.layers.data(...) >>> hidden = fluid.layers.fc(...) - + Args: main_program(Program): New main program inside `with` statement - startup_program(Program): New startup program inside `with` statement. + startup_program(Program): New startup program inside `with` statement. None means do not change startup program. 
Returns: diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index d3f648f846..c0839caaf2 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py @@ -132,6 +132,8 @@ class UniformInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed op = block.prepend_op( type="uniform_random", outputs={"Out": var}, @@ -180,6 +182,8 @@ class NormalInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed op = block.prepend_op( type="gaussian_random", outputs={"Out": var}, @@ -255,6 +259,9 @@ class XavierInitializer(Initializer): fan_in = f_in if self._fan_in is None else self._fan_in fan_out = f_out if self._fan_out is None else self._fan_out + if self._seed == 0: + self._seed = block.program.random_seed + if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) op = block.prepend_op( @@ -338,6 +345,9 @@ class MSRAInitializer(Initializer): # If fan_in is passed, use it fan_in = f_in if self._fan_in is None else self._fan_in + if self._seed == 0: + self._seed = block.program.random_seed + if self._uniform: limit = np.sqrt(6.0 / float(fan_in)) op = block.prepend_op( diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 5b384e5cf5..3963e13222 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -1,7 +1,7 @@ import copy import itertools -from framework import Variable, default_main_program, default_startup_program, \ +from framework import Variable, Parameter, default_main_program, default_startup_program, \ unique_name, dtype_is_floating from paddle.v2.fluid.initializer import Constant, Xavier from param_attr import ParamAttr @@ -122,6 +122,12 @@ class LayerHelper(object): return self.main_program.global_block().create_parameter( dtype=dtype, shape=shape, **attr.to_kwargs()) + def get_parameter(self, name): + param = self.main_program.global_block().var(name) + if not isinstance(param, Parameter): + raise ValueError("no Parameter name %s found" % name) + return param + def create_tmp_variable(self, dtype): return self.main_program.current_block().create_var( name=unique_name(".".join([self.name, 'tmp'])), @@ -145,6 +151,13 @@ class LayerHelper(object): persistable=True, initializer=initializer) + @property + def to_kwargs(self): + return { + 'main_program': self.main_program, + 'startup_program': self.startup_program + } + def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ Append bias operator and return its output. 
If the user does not set diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 7c1514efad..9d6ff7c47a 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -6,6 +6,7 @@ from paddle.v2.fluid.layer_helper import LayerHelper, unique_name import re import cStringIO from param_attr import ParamAttr +import contextlib __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', @@ -179,6 +180,77 @@ def dynamic_lstm(input, return hidden, cell +def gru_unit(input, + hidden, + size, + weight=None, + bias=None, + activation='tanh', + gate_activation='sigmoid', + main_program=None, + startup_program=None): + """ + GRUUnit Operator implements partial calculations of the GRU unit as following: + + $$ + update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ + reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ + output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ + output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) + $$ + + which is same as one time step of GRU Operator. + + @note To implement the complete GRU unit, fully-connected operator must be + used before to feed xu, xr and xc as the Input of GRUUnit operator. + + TODO(ChunweiYan) add more document here + """ + activation_dict = dict( + identity=0, + sigmoid=1, + tanh=2, + relu=3, ) + activation = activation_dict[activation] + gate_activation = activation_dict[gate_activation] + + helper = LayerHelper('gru_unit', **locals()) + dtype = helper.input_dtype() + size = size / 3 + + # create weight + if weight is None: + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + + # create bias + if bias is None: + bias_size = [1, 3 * size] + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + + gate = helper.create_tmp_variable(dtype) + reset_hidden_pre = helper.create_tmp_variable(dtype) + updated_hidden = helper.create_tmp_variable(dtype) + + helper.append_op( + type='gru_unit', + inputs={'Input': input, + 'HiddenPrev': hidden, + 'Weight': weight}, + outputs={ + 'Gate': gate, + 'ResetHiddenPrev': reset_hidden_pre, + 'Hidden': updated_hidden, + }, + attrs={ + 'activation': 0, + 'gate_activation': 1, + }) + + return updated_hidden, reset_hidden_pre, gate + + def data(name, shape, append_batch_size=True, @@ -358,7 +430,8 @@ def _create_op_func_(op_type): dtype = each.dtype elif dtype != each.dtype: raise ValueError( - "operator {0} must input same dtype".format(op_type)) + "operator {0} must input same dtype. 
{1} vs {2}".format( + op_type, dtype, each.dtype)) return dtype @@ -477,6 +550,24 @@ def linear_chain_crf(input, return log_likelihood +def crf_decoding(input, + param_attr, + label=None, + main_program=None, + startup_program=None): + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], + "Transition": transition, + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) + + return viterbi_path + + def assign(input, output, main_program=None, startup_program=None): helper = LayerHelper('assign', **locals()) helper.append_op( @@ -614,6 +705,40 @@ def accuracy(input, label, k=1, correct=None, total=None, **kwargs): return acc_out +def chunk_eval(input, + label, + chunk_scheme, + num_chunk_types, + excluded_chunk_types=None, + **kwargs): + """ + This function computes the accuracy using the input and label. + The output is the top_k inputs and their indices. + """ + helper = LayerHelper("chunk_eval", **kwargs) + + # prepare output + precision = helper.create_tmp_variable(dtype="float32") + recall = helper.create_tmp_variable(dtype="float32") + f1_score = helper.create_tmp_variable(dtype="float32") + + helper.append_op( + type="chunk_eval", + inputs={"Inference": [input], + "Label": [label]}, + outputs={ + "Precision": [precision], + "Recall": [recall], + "F1-Score": [f1_score] + }, + attrs={ + "num_chunk_types": num_chunk_types, + 'chunk_scheme': chunk_scheme, + 'excluded_chunk_types': excluded_chunk_types or [] + }) + return precision, recall, f1_score + + def sequence_conv(input, num_filters, filter_size=3, @@ -637,7 +762,7 @@ def sequence_conv(input, helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() filter_shape = [filter_size * input.shape[1], num_filters] - filter = helper.create_parameter( + filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype) pre_bias = helper.create_tmp_variable(dtype) @@ -645,7 +770,7 @@ def sequence_conv(input, type='sequence_conv', inputs={ 'X': [input], - 'Filter': [filter], + 'Filter': [filter_param], }, outputs={"Out": pre_bias}, attrs={ @@ -660,7 +785,7 @@ def sequence_conv(input, def conv2d(input, num_filters, filter_size, - stride=[1, 1], + stride=None, padding=None, groups=None, param_attr=None, @@ -677,6 +802,8 @@ def conv2d(input, conv-2d output, if mentioned in the input parameters. 
""" + if stride is None: + stride = [1, 1] helper = LayerHelper('conv2d', **locals()) dtype = helper.input_dtype() @@ -702,7 +829,7 @@ def conv2d(input, std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 return Normal(0.0, std, 0) - filter = helper.create_parameter( + filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, @@ -711,10 +838,10 @@ def conv2d(input, pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type='conv2d', + type='conv2d_cudnn', inputs={ 'Input': input, - 'Filter': filter, + 'Filter': filter_param, }, outputs={"Output": pre_bias}, attrs={'strides': stride, @@ -750,8 +877,8 @@ def sequence_pool(input, pool_type, **kwargs): def pool2d(input, pool_size, pool_type, - pool_stride=[1, 1], - pool_padding=[0, 0], + pool_stride=None, + pool_padding=None, global_pooling=False, main_program=None, startup_program=None): @@ -759,6 +886,10 @@ def pool2d(input, This function adds the operator for pooling in 2 dimensions, using the pooling configurations mentioned in input parameters. """ + if pool_padding is None: + pool_padding = [0, 0] + if pool_stride is None: + pool_stride = [1, 1] if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", @@ -1343,7 +1474,7 @@ def lod_tensor_to_array(x, table, main_program=None): return array -def array_to_lod_tensor(x, table, main_program=None): +def array_to_lod_tensor(x, table, main_program=None, startup_program=None): """ This function creates an operator to convert an array to a LOD_Tensor. @@ -1424,7 +1555,11 @@ def zeros(shape, dtype, main_program=None): return fill_constant(value=0.0, **locals()) -def increment(x, value=1.0, in_place=True, main_program=None): +def increment(x, + value=1.0, + in_place=True, + main_program=None, + startup_program=None): """ This function creates an operator to increment each value in the input `x` by an amount: `value` as mentioned in the input parameter. This @@ -1443,7 +1578,7 @@ def increment(x, value=1.0, in_place=True, main_program=None): return out -def array_write(x, i, array=None, main_program=None): +def array_write(x, i, array=None, main_program=None, startup_program=None): """ This function creates an operator to write the data out as a LOD_TENSOR_ARRAY. @@ -1482,7 +1617,7 @@ def less_than(x, y, cond=None, main_program=None, **ignored): return cond -def array_read(array, i, main_program=None): +def array_read(array, i, main_program=None, startup_program=None): """ This function creates an operator to read the data in as a LOD_TENSOR_ARRAY. @@ -1501,7 +1636,7 @@ def array_read(array, i, main_program=None): return out -def shrink_memory(x, i, table, main_program=None): +def shrink_memory(x, i, table, main_program=None, startup_program=None): """ This function creates an operator to shrink_rnn_memory using the RankTable as mentioned in the input parameter. 
@@ -1607,11 +1742,13 @@ def conv2d_transpose(input, h_in = input.shape[2] w_in = input.shape[3] + filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0] - 1) / dilation[0] + 1 filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] - 1) / dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] + elif isinstance(filter_size, int): filter_size = [filter_size, filter_size] @@ -1850,3 +1987,209 @@ class IfElse(object): main_program=self.helper.main_program, startup_program=self.helper.startup_program)) return rlist + + +class DynamicRNN(object): + BEFORE_RNN = 0 + IN_RNN = 1 + AFTER_RNN = 2 + + def __init__(self, name=None, main_program=None, startup_program=None): + self.helper = LayerHelper( + 'dynamic_rnn', + name=name, + main_program=main_program, + startup_program=startup_program) + self.status = DynamicRNN.BEFORE_RNN + self.lod_rank_table = None + self.max_seq_len = None + self.step_idx = None + self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64') + self.mem_dict = dict() + self.output_array = [] + self.outputs = [] + self.cond = self.helper.create_tmp_variable(dtype='bool') + self.cond.stop_gradient = False + self.while_op = While(self.cond) + self.input_array = [] + self.mem_link = [] + + def step_input(self, x): + self._assert_in_rnn_block_("step_input") + if not isinstance(x, Variable): + raise TypeError( + "step_input() can only take a Variable as its input") + parent_block = self._parent_block_() + if self.lod_rank_table is None: + self.lod_rank_table = parent_block.create_var( + name=unique_name('lod_rank_table'), + type=core.VarDesc.VarType.LOD_RANK_TABLE) + self.lod_rank_table.stop_gradient = True + parent_block.append_op( + type='lod_rank_table', + inputs={"X": x}, + outputs={"Out": self.lod_rank_table}) + self.max_seq_len = parent_block.create_var( + name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64') + self.max_seq_len.stop_gradient = False + parent_block.append_op( + type='max_sequence_len', + inputs={'RankTable': self.lod_rank_table}, + outputs={"Out": self.max_seq_len}) + self.cond.stop_gradient = True + parent_block.append_op( + type='less_than', + inputs={'X': self.step_idx, + 'Y': self.max_seq_len}, + outputs={'Out': self.cond}) + + input_array = parent_block.create_var( + name=unique_name('dynamic_rnn_input_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=x.dtype) + self.input_array.append((input_array, x.dtype)) + parent_block.append_op( + type='lod_tensor_to_array', + inputs={'X': x, + 'RankTable': self.lod_rank_table}, + outputs={'Out': input_array}) + return array_read( + array=input_array, i=self.step_idx, **self.helper.to_kwargs) + + @contextlib.contextmanager + def block(self): + if self.status != DynamicRNN.BEFORE_RNN: + raise ValueError("rnn.block() can only be invoke once") + self.step_idx = fill_constant(shape=[1], dtype='int64', value=0) + self.step_idx.stop_gradient = False + self.status = DynamicRNN.IN_RNN + with self.while_op.block(): + yield + increment( + x=self.step_idx, + value=1.0, + in_place=True, + **self.helper.to_kwargs) + + for new_mem, mem_array in self.mem_link: + array_write( + x=new_mem, + i=self.step_idx, + array=mem_array, + **self.helper.to_kwargs) + + less_than( + x=self.step_idx, + y=self.max_seq_len, + cond=self.cond, + **self.helper.to_kwargs) + + self.status = DynamicRNN.AFTER_RNN + for each_array in self.output_array: + self.outputs.append( + array_to_lod_tensor( + x=each_array, + table=self.lod_rank_table, + **self.helper.to_kwargs)) + + def __call__(self, 
*args, **kwargs): + if self.status != DynamicRNN.AFTER_RNN: + raise ValueError( + "Dynamic RNN outputs can only be retrieved after rnn block") + if len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def memory(self, init=None, shape=None, value=0.0, dtype='float32'): + self._assert_in_rnn_block_('memory') + if init is not None: + if not isinstance(init, Variable): + raise TypeError( + "The input arg `init` of memory() must be a Variable") + parent_block = self._parent_block_() + mem_array = parent_block.create_var( + name=unique_name('dynamic_rnn_mem_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init.dtype) + parent_block.append_op( + type='write_to_array', + inputs={'X': init, + 'I': self.zero_idx}, + outputs={'Out': mem_array}) + retv = array_read( + array=mem_array, i=self.step_idx, **self.helper.to_kwargs) + retv = shrink_memory( + x=retv, + i=self.step_idx, + table=self.lod_rank_table, + **self.helper.to_kwargs) + self.mem_dict[retv.name] = mem_array + return retv + else: + if len(self.input_array) == 0: + raise ValueError( + "step_input should be invoked before memory(shape=..., value=...)" + ) + parent_block = self._parent_block_() + init = parent_block.create_var( + name=unique_name('mem_init'), dtype=dtype) + arr, dtype = self.input_array[0] + in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype) + parent_block.append_op( + type='read_from_array', + inputs={'X': [arr], + 'I': [self.zero_idx]}, + outputs={'Out': [in0]}) + parent_block.append_op( + type='fill_constant_batch_size_like', + inputs={'Input': [in0]}, + outputs={'Out': [init]}, + attrs={ + 'shape': [-1] + shape, + 'value': float(value), + 'dtype': init.dtype + }) + return self.memory(init=init) + + def update_memory(self, ex_mem, new_mem): + self._assert_in_rnn_block_('update_memory') + if not isinstance(ex_mem, Variable): + raise TypeError("The input arg `ex_mem` of update_memory() must " + "be a Variable") + if not isinstance(new_mem, Variable): + raise TypeError("The input arg `new_mem` of update_memory() must " + "be a Variable") + + mem_array = self.mem_dict.get(ex_mem.name, None) + if mem_array is None: + raise ValueError("Please invoke memory before update_memory") + if self.lod_rank_table is None: + raise ValueError("Please invoke step_input before update_memory") + + self.mem_link.append((new_mem, mem_array)) + + def output(self, *outputs): + self._assert_in_rnn_block_('output') + parent_block = self._parent_block_() + for each in outputs: + outside_array = parent_block.create_var( + name=unique_name("_".join( + [self.helper.name, "output_array", each.name])), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=each.dtype) + array_write(x=each, i=self.step_idx, array=outside_array) + self.output_array.append(outside_array) + + def _parent_block_(self): + prog = self.helper.main_program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + + return parent_block + + def _assert_in_rnn_block_(self, method): + if self.status != DynamicRNN.IN_RNN: + raise ValueError("{0} can only be invoked inside rnn block.".format( + method)) diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index 05728ad75a..7ef524318e 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -9,6 +9,7 @@ def simple_img_conv_pool(input, pool_size, pool_stride, act, + param_attr=None, pool_type='max', main_program=None, startup_program=None): @@ -16,6 +17,7 @@ def 
simple_img_conv_pool(input, input=input, num_filters=num_filters, filter_size=filter_size, + param_attr=param_attr, act=act, main_program=main_program, startup_program=startup_program) @@ -36,6 +38,7 @@ def img_conv_group(input, conv_padding=1, conv_filter_size=3, conv_act=None, + param_attr=None, conv_with_batchnorm=False, conv_batchnorm_drop_rate=None, pool_stride=1, @@ -57,6 +60,7 @@ def img_conv_group(input, conv_padding = __extend_list__(conv_padding) conv_filter_size = __extend_list__(conv_filter_size) + param_attr = __extend_list__(param_attr) conv_with_batchnorm = __extend_list__(conv_with_batchnorm) conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) @@ -70,6 +74,7 @@ def img_conv_group(input, num_filters=conv_num_filter[i], filter_size=conv_filter_size[i], padding=conv_padding[i], + param_attr=param_attr[i], act=local_conv_act, main_program=main_program, startup_program=startup_program) @@ -101,6 +106,7 @@ def img_conv_group(input, def sequence_conv_pool(input, num_filters, filter_size, + param_attr=None, act="sigmoid", pool_type="max", main_program=None, @@ -109,6 +115,7 @@ def sequence_conv_pool(input, input=input, num_filters=num_filters, filter_size=filter_size, + param_attr=param_attr, act=act, main_program=main_program, startup_program=startup_program) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 719e3b2563..bbdfab2df9 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -18,8 +18,9 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, global_step=None): + def __init__(self, global_step=None, regularization=None): self._global_step = global_step + self.regularization = regularization # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators. @@ -199,7 +200,8 @@ class Optimizer(object): """ params_grads = append_backward_ops(loss, parameter_list, no_grad_set) # Add regularization if any - params_grads = append_regularization_ops(params_grads) + params_grads = append_regularization_ops(params_grads, + self.regularization) optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) return optimize_ops @@ -209,9 +211,9 @@ class SGDOptimizer(Optimizer): """ Simple SGD optimizer without any state. 
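The `regularization` argument added to the `Optimizer` base class above is forwarded by the subclasses below through `**kwargs`, so a global weight decay can be attached when constructing any optimizer. A sketch (the coefficient is illustrative; parameters that carry their own regularizer still take precedence, as implemented in `regularizer.py` later in this patch, and `avg_cost` is assumed to be the loss variable of the network):

.. code-block:: python

    import paddle.v2.fluid as fluid

    optimizer = fluid.optimizer.Adam(
        learning_rate=1e-3,
        regularization=fluid.regularizer.L2Decay(1e-4))
    optimizer.minimize(avg_cost)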
""" - def __init__(self, learning_rate, global_step=None): + def __init__(self, learning_rate, **kwargs): assert learning_rate is not None - super(SGDOptimizer, self).__init__(global_step) + super(SGDOptimizer, self).__init__(**kwargs) self.type = "sgd" self._learning_rate = learning_rate @@ -236,14 +238,10 @@ class MomentumOptimizer(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, - learning_rate, - momentum, - use_nesterov=False, - global_step=None): + def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): assert learning_rate is not None assert momentum is not None - super(MomentumOptimizer, self).__init__(global_step) + super(MomentumOptimizer, self).__init__(**kwargs) self.type = "momentum" self._learning_rate = learning_rate self._momentum = momentum @@ -284,10 +282,10 @@ class AdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, learning_rate, epsilon=1.0e-6, global_step=None): + def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert epsilon is not None - super(AdagradOptimizer, self).__init__(global_step) + super(AdagradOptimizer, self).__init__(**kwargs) self.type = "adagrad" self._learning_rate = learning_rate self._epsilon = epsilon @@ -331,12 +329,12 @@ class AdamOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamOptimizer, self).__init__(global_step) + super(AdamOptimizer, self).__init__(**kwargs) self.type = "adam" self._learning_rate = learning_rate self._beta1 = beta1 @@ -436,12 +434,12 @@ class AdamaxOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__() + super(AdamaxOptimizer, self).__init__(**kwargs) self.type = "adamax" self._learning_rate = learning_rate self._beta1 = beta1 @@ -514,16 +512,12 @@ class DecayedAdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - global_step=None): + def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert decay is not None assert epsilon is not None - super(DecayedAdagradOptimizer, self).__init__(global_step) + super(DecayedAdagradOptimizer, self).__init__(**kwargs) self.type = "decayed_adagrad" self._learning_rate = learning_rate self._decay = decay diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py index c2c18e1951..d1955b0047 100644 --- a/python/paddle/v2/fluid/regularizer.py +++ b/python/paddle/v2/fluid/regularizer.py @@ -3,7 +3,7 @@ import framework __all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay'] -def append_regularization_ops(parameters_and_grads): +def append_regularization_ops(parameters_and_grads, regularization=None): """Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. @@ -14,6 +14,8 @@ def append_regularization_ops(parameters_and_grads): Args: parameters_and_grads: A list of (parameters, gradients) pairs that need to be regularized. + regularization: A global regularizer. If the parameter is not + set. It will be applied with regularizer. 
Returns: list of (parameters, gradients) pair with the regularized gradient @@ -23,14 +25,19 @@ def append_regularization_ops(parameters_and_grads): """ params_and_grads = [] for param, grad in parameters_and_grads: + regularization_term = None + if param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad.block) + # If no gradient or no regularization specified, # then we don't need to do anything - if grad is None or param.regularizer is None: + if grad is None or regularization_term is None: params_and_grads.append((param, grad)) continue - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad.block) assert grad.shape == regularization_term.shape grad.block.append_op( @@ -145,7 +152,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): # import paddle.fluid as fluid # # hidden = fluid.layers.fc(..., -# param_attr=ParamAttr(fluid.regularizer.Xavier())) +# param_attr=fluid.regularizer.Xavier()) # # It is no need to add a `Regularizer` as the class suffix L1Decay = L1DecayRegularizer diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 0494c7cdca..d2693b602e 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -1,3 +1,5 @@ +import math + import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 @@ -137,12 +139,23 @@ def main(): param_attr=fluid.ParamAttr( name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(x=crf_cost) + # TODO(qiao) - # 1. add crf_decode_layer and evaluator - # 2. 
use other optimizer and check why out will be NAN + # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) sgd_optimizer.minimize(avg_cost) + # TODO(qiao) + # add dependency track and move this config before optimizer + crf_decode = fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + precision, recall, f1_score = fluid.layers.chunk_eval( + input=crf_decode, + label=target, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) + train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), @@ -166,11 +179,17 @@ def main(): for data in train_data(): outs = exe.run(fluid.default_main_program(), feed=feeder.feed(data), - fetch_list=[avg_cost]) + fetch_list=[avg_cost, precision, recall, f1_score]) avg_cost_val = np.array(outs[0]) + precision_val = np.array(outs[1]) + recall_val = np.array(outs[2]) + f1_score_val = np.array(outs[3]) if batch_id % 10 == 0: print("avg_cost=" + str(avg_cost_val)) + print("precision_val=" + str(precision_val)) + print("recall_val:" + str(recall_val)) + print("f1_score_val:" + str(f1_score_val)) # exit early for CI exit(0) diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py new file mode 100644 index 0000000000..80ffc5a544 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -0,0 +1,119 @@ +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.executor import Executor + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +word_dim = 16 +IS_SPARSE = True +batch_size = 10 +max_length = 50 +topk_size = 50 +trg_dic_size = 10000 + +decoder_size = hidden_dim + + +def encoder_decoder(): + # encoder + src_word_id = layers.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last") + + # decoder + trg_language_word = layers.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + mem = rnn.memory(init=encoder_out) + fc1 = fluid.layers.fc(input=[current_word, mem], + size=decoder_size, + act='tanh') + out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') + rnn.update_memory(mem, fc1) + rnn.output(out) + + return rnn() + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = 
core.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + rnn_out = encoder_decoder() + label = layers.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + place = core.CPUPlace() + exe = Executor(place) + + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(2): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(framework.default_main_program(), + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + exit(0) + batch_id += 1 + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py index bd52bef260..b052374dc7 100644 --- a/python/paddle/v2/fluid/tests/test_activation_op.py +++ b/python/paddle/v2/fluid/tests/test_activation_op.py @@ -1,6 +1,7 @@ import unittest import numpy as np from op_test import OpTest +from scipy.special import expit class TestExp(OpTest): @@ -455,5 +456,20 @@ class TestHardSigmoid(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.002) +class TestSwish(OpTest): + def setUp(self): + self.op_type = "swish" + X = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + self.inputs = {'X': X} + self.attrs = {'beta': 2.3} + self.outputs = {'Y': X * expit(self.attrs['beta'] * X)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.008) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py index 48673296a6..819e65a653 100644 --- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py +++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py @@ -120,7 +120,7 @@ class TestChunkEvalOp(OpTest): self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9 def set_data(self): - infer = np.zeros((self.batch_size, )).astype('int32') + infer = np.zeros((self.batch_size, )).astype('int64') infer.fill(self.num_chunk_types * self.num_tag_types) label = np.copy(infer) starts = np.random.choice( diff --git a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py index ee2b996bf4..ab573da31d 100644 --- a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py +++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py @@ -20,14 +20,14 @@ class CRFDecoding(object): self.w = transition_weights[2:, :] self.track = np.zeros( - (seq_start_positions[-1], self.tag_num), dtype="int32") + (seq_start_positions[-1], self.tag_num), dtype="int64") self.decoded_path = np.zeros( - (seq_start_positions[-1], 1), dtype="int32") + (seq_start_positions[-1], 1), dtype="int64") def 
_decode_one_sequence(self, decoded_path, x): seq_len, tag_num = x.shape alpha = np.zeros((seq_len, tag_num), dtype="float64") - track = np.zeros((seq_len, tag_num), dtype="int32") + track = np.zeros((seq_len, tag_num), dtype="int64") for i in range(tag_num): alpha[0, i] = self.a[i] + x[0, i] @@ -125,10 +125,10 @@ class TestCRFDecodingOp2(OpTest): axis=0) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32") + low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") predicted_labels = np.ones( - (lod[-1][-1], 1), dtype="int32") * (TAG_NUM - 1) - expected_output = (labels == predicted_labels).astype("int32") + (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1) + expected_output = (labels == predicted_labels).astype("int64") self.inputs = { "Emission": (emission, lod), diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py index 271e39a0e0..034266c26f 100644 --- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py +++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py @@ -7,7 +7,7 @@ import numpy class TestDynRNN(unittest.TestCase): def setUp(self): self.word_dict = paddle.dataset.imdb.word_dict() - self.BATCH_SIZE = 100 + self.BATCH_SIZE = 2 self.train_data = paddle.batch( paddle.dataset.imdb.train(self.word_dict), batch_size=self.BATCH_SIZE) @@ -55,6 +55,7 @@ class TestDynRNN(unittest.TestCase): mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') + fluid.layers.array_write(x=hidden, i=i, array=out) fluid.layers.increment(x=i, in_place=True) fluid.layers.array_write(x=hidden, i=i, array=mem_array) @@ -82,6 +83,48 @@ class TestDynRNN(unittest.TestCase): print(val) self.assertFalse(numpy.isnan(val)) + def test_train_dyn_rnn(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + sentence = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=1) + sent_emb = fluid.layers.embedding( + input=sentence, size=[len(self.word_dict), 32], dtype='float32') + + rnn = fluid.layers.DynamicRNN() + + with rnn.block(): + in_ = rnn.step_input(sent_emb) + mem = rnn.memory(shape=[100], dtype='float32') + out_ = fluid.layers.fc(input=[in_, mem], size=100, act='tanh') + rnn.update_memory(mem, out_) + rnn.output(out_) + + last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') + logits = fluid.layers.fc(input=last, size=1, act=None) + label = fluid.layers.data(name='label', shape=[1], dtype='float32') + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=logits, label=label) + loss = fluid.layers.mean(x=loss) + sgd = fluid.optimizer.Adam(1e-3) + sgd.minimize(loss=loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(startup_program) + feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu) + data = next(self.train_data()) + loss_0 = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + for _ in xrange(100): + val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + # loss should be small after 100 mini-batch + self.assertLess(val[0], loss_0[0]) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_fill_op.py b/python/paddle/v2/fluid/tests/test_fill_op.py new file mode 100644 index 0000000000..88337598c8 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_fill_op.py @@ -0,0 +1,24 @@ +import unittest +import numpy as np +from op_test import OpTest 
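The new `TestSwish` case above checks the activation against `y = x * sigmoid(beta * x)`; a standalone numpy sketch of that reference computation (values are illustrative):

.. code-block:: python

    import numpy as np
    from scipy.special import expit    # the logistic sigmoid

    beta = 2.3
    x = np.random.uniform(0.1, 1, [11, 17]).astype('float32')
    y = x * expit(beta * x)            # reference output used by the unit test
    assert y.shape == x.shape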
+import paddle.v2.fluid.core as core + + +class TestFillOp(OpTest): + def setUp(self): + self.op_type = "fill" + val = np.random.random(size=[100, 200]) + self.inputs = {} + self.attrs = { + 'value': val.flatten().tolist(), + 'shape': [100, 200], + 'dtype': int(core.DataType.FP64) + } + self.outputs = {'Out': val.astype('float64')} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py index 6c20203f8e..3175010f48 100644 --- a/python/paddle/v2/fluid/tests/test_initializer.py +++ b/python/paddle/v2/fluid/tests/test_initializer.py @@ -60,6 +60,29 @@ class TestUniformInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) + def test_uniform_initializer_random_seed(self): + """Test the uniform initializer with manually setting seed + """ + program = framework.Program() + program.random_seed = 123 + block = program.global_block() + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer()) + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(seed=456)) + init_op = block.ops[1] + self.assertEqual(init_op.attr("seed"), 123) + init_op1 = block.ops[0] + self.assertEqual(init_op1.attr("seed"), 456) + def test_uniform_initializer(self): """Test uniform initializer with supplied attributes """ diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index a9d9d369c7..57f6a362de 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -4,6 +4,7 @@ import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets from paddle.v2.fluid.framework import Program, program_guard +from paddle.v2.fluid.param_attr import ParamAttr class TestBook(unittest.TestCase): @@ -129,11 +130,21 @@ class TestBook(unittest.TestCase): def test_linear_chain_crf(self): program = Program() with program_guard(program, startup_program=Program()): + label_dict_len = 10 images = layers.data(name='pixel', shape=[784], dtype='float32') label = layers.data(name='label', shape=[1], dtype='int32') hidden = layers.fc(input=images, size=128) - crf = layers.linear_chain_crf(input=hidden, label=label) + crf = layers.linear_chain_crf( + input=hidden, label=label, param_attr=ParamAttr(name="crfw")) + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + layers.chunk_eval( + input=crf_decode, + label=label, + chunk_scheme="IOB", + num_chunk_types=(label_dict_len - 1) / 2) self.assertNotEqual(crf, None) + self.assertNotEqual(crf_decode, None) print(str(program)) diff --git a/python/paddle/v2/fluid/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py index 7e34b3c91c..9abb09e53a 100644 --- a/python/paddle/v2/fluid/tests/test_lrn_op.py +++ b/python/paddle/v2/fluid/tests/test_lrn_op.py @@ -23,7 +23,7 @@ class TestLRNOp(OpTest): start = -(self.n - 1) / 2 end = start + self.n - mid = np.empty((self.N, self.C, self.H, self.W), dtype=float) + mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32") mid.fill(self.k) for m in range(0, self.N): for i in range(0, self.C): @@ -74,5 +74,4 @@ class TestLRNOp(OpTest): if __name__ == "__main__": - exit(0) # LRN grad implement wrong 
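(Editorial aside, not part of the patch.) The `test_lrn_op.py` fix above replaces `np.empty(..., dtype=float)` with `.astype("float32")`: NumPy's `float` is double precision, so the old code built the reference buffer as float64 while the rest of the test's data is single precision. A quick check of the dtypes involved:

```python
import numpy as np

a = np.empty((2, 3), dtype=float)        # Python float maps to float64
b = np.empty((2, 3)).astype("float32")   # explicit single precision, as in the fix
assert a.dtype == np.float64
assert b.dtype == np.float32
```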
unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_row_conv_op.py b/python/paddle/v2/fluid/tests/test_row_conv_op.py new file mode 100644 index 0000000000..1ed86e23ac --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py @@ -0,0 +1,95 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def row_conv_forward(x, lod, wt): + out = np.zeros_like(x) + seq_info = lod[0] + num_sequences = len(seq_info) - 1 + context_length = wt.shape[0] + + for i in range(num_sequences): # loop over number of sequences + start = seq_info[i] + end = seq_info[i + 1] + curinput = x[start:end, :] + curoutput = out[start:end, :] + + cur_timesteps = end - start + for j in range(cur_timesteps): # loop over different timesteps + for k in range(context_length): + + if j + k >= cur_timesteps: + continue + curoutput[j, :] += curinput[j + k, :] * wt[k, :] + + return out + + +class TestRowConvOp1(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 2, 5, 7]] + T = lod[0][-1] + D = 16 + context_length = 2 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter')) + + +class TestRowConvOp2(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 20, 50, 100]] + T = lod[0][-1] + D = 35 + context_length = 35 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + #max_relative_error is increased from 0.05 to 0.06 as for higher + #dimensional input, the dX on CPU for some values has max_rel_error + #slightly more than 0.05 + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.06, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index fe91df10da..9ccb4dc176 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,8 +1,61 @@ from setuptools import setup, Distribution, Extension +import subprocess class BinaryDistribution(Distribution): def has_ext_modules(foo): return True +MAJOR = 0 +MINOR = 11 +PATCH = 0 +RC = 0 +ISTAGED = False + + + +def git_commit(): + try: + cmd = ['git', 'rev-parse', 'HEAD'] + git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + except: + git_commit = 'Unknown' + return git_commit + +def write_version_py(filename='paddle/version.py'): + cnt = ''' +# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY +# +full_version = '%(major)d.%(minor)d.%(patch)d' +major = '%(major)d' +minor = '%(minor)d' +patch = '%(patch)d' +rc = '%(rc)d' 
+istaged = %(istaged)s +commit = '%(commit)s' + +def show(): + if istaged: + print 'full_version:', full_version + print 'major:', major + print 'minor:', minor + print 'patch:', patch + print 'rc:', rc + else: + print 'commit:', commit +''' + commit = git_commit() + with open(filename, 'w') as f: + f.write(cnt % { + 'major': MAJOR, + 'minor': MINOR, + 'patch': PATCH, + 'rc': RC, + 'version': '${PADDLE_VERSION}', + 'commit': commit, + 'istaged': ISTAGED}) + +write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') + + packages=['paddle', 'paddle.proto', 'paddle.trainer', @@ -21,7 +74,7 @@ with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: setup_requires = f.read().splitlines() if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: - setup_requires+=["opencv-python"] + setup_requires+=['opencv-python'] # the prefix is sys.prefix which should always be usr paddle_bin_dir = 'opt/paddle/bin' @@ -36,7 +89,7 @@ paddle_rt_libs = ['${WARPCTC_LIBRARIES}'] if '${MKL_SHARED_LIBS}'!= '': paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';') -setup(name='paddlepaddle', +setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires,
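(Editorial aside, not part of the patch.) The `write_version_py` helper added to `setup.py.in` above generates `python/paddle/version.py` at build time, exposing the version fields plus a `show()` helper. Roughly how the generated module would be consumed, assuming it is importable as `paddle.version`; the template targets Python 2, hence the print statements:

```python
# Python 2 style, matching the generated version.py in this patch.
import paddle.version

print paddle.version.full_version   # '0.11.0' for a staged release
print paddle.version.commit         # git commit hash recorded at build time
paddle.version.show()               # prints the version fields, or the commit when not staged
```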
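(Editorial aside, not part of the patch.) The `row_conv_forward` reference in the new `test_row_conv_op.py` above implements a lookahead ("row") convolution: within one sequence, output row `j` mixes row `j` with the next `context_length - 1` rows, weighted elementwise per feature and truncated at the sequence end. A tiny worked instance with made-up numbers:

```python
import numpy as np

def row_conv_one_seq(x, wt):
    # Same recurrence as the test's reference, restricted to a single sequence:
    # out[j] = sum over k of x[j + k] * wt[k], skipping terms past the last row.
    out = np.zeros_like(x)
    for j in range(x.shape[0]):
        for k in range(wt.shape[0]):
            if j + k < x.shape[0]:
                out[j] += x[j + k] * wt[k]
    return out

x = np.array([[1.0], [2.0], [4.0]])   # 3 timesteps, 1 feature
wt = np.array([[0.5], [0.25]])        # context_length = 2
print(row_conv_one_seq(x, wt))        # [[1.0], [2.0], [2.0]]
```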